In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

# Library for data preprocessing
import fuzzywuzzy
from fuzzywuzzy import process

# Library for machine learning
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# Utils
from tqdm import tqdm
import missingno as msno
import optuna
import warnings
warnings.filterwarnings('ignore')
d:\category_dtype_hanoi_saleprice\venv\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Config¶

In [2]:
def replace_matches_in_column(df, column, string_to_match, min_ratio):
    '''
    Replace all rows in a column that match a string greater than threshold
    '''
    strings = df[column].unique()
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only get matches with a ratio > min_ratio
    # matches[1:] because maches[0] alawys = (string_to_match, 100)
    close_matches = [matches[0] for matches in matches[1:] if matches[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)
    for index, row in df[rows_with_matches].iterrows():
        print(index, row[column], '->', string_to_match)

    # replace all rows with close matches with the input matches 
    df.loc[rows_with_matches, column] = string_to_match
In [3]:
# Extract index and value from a pandas series

def extract_idx_value(pd_series):
    idx = pd_series.index
    value = pd_series.values
    return idx, value
In [4]:
# Custom Classifier

class Classifier:
    '''
    A class to train and evaluate multiple classifiers with cross validation
    '''
    def __init__(self, n_fold=5, random_state=42, use_gpu=True):
        self.classifiers = {
            'Linear Model': LinearRegression(),
            'Decision Tree': DecisionTreeRegressor(random_state=random_state),
            'KNN': KNeighborsRegressor(),
            'Random Forest': RandomForestRegressor(random_state=random_state),
            'XGBoost': XGBRegressor(random_state=random_state, tree_method='gpu_hist') if use_gpu else XGBRegressor(random_state=random_state),
        }
        self.n_fold = n_fold

        self.scores_rmse = {key: 0 for key in self.classifiers.keys()}
        self.std_rmse = {key: 0 for key in self.classifiers.keys()}
        self.scores_mae = {key: 0 for key in self.classifiers.keys()}
        self.std_mae = {key: 0 for key in self.classifiers.keys()}
    
    def fit(self, X, y):
        kf = KFold(n_splits=self.n_fold)
        for classifier in tqdm(self.classifiers):
            scores_rmse = []
            scores_mae = []
            for train_index, val_index in kf.split(X, y):
                X_train, y_train = X.iloc[train_index], y.iloc[train_index]
                X_val, y_val = X.iloc[val_index], y.iloc[val_index]

                self.classifiers[classifier].fit(X_train, y_train)

                y_pred = self.classifiers[classifier].predict(X_val)
                scores_rmse.append(mean_squared_error(y_val, y_pred, squared=False))
                scores_mae.append(mean_squared_error(y_val, y_pred, squared=True))

            self.scores_rmse[classifier] = np.mean(scores_rmse)
            self.std_rmse[classifier] = np.std(scores_rmse)

            self.scores_mae[classifier] = np.mean(scores_mae)
            self.std_mae[classifier] = np.std(scores_mae)
            
    def summary(self):
        return pd.DataFrame({
            'Score RMSE': self.scores_rmse,
            'Std RMSE': self.std_rmse,
            'Score MAE': self.scores_mae,
            'Std MAE': self.std_mae
        })
In [5]:
pd.set_option('display.float_format', lambda x: '%.9f' % x)
In [6]:
DATA_PATH = 'VN_housing_dataset.csv'
In [7]:
# Import data and show the first 5 rows

df = pd.read_csv(DATA_PATH)
df.head()
Out[7]:
Unnamed: 0 Ngày Địa chỉ Quận Huyện Loại hình nhà ở Giấy tờ pháp lý Số tầng Số phòng ngủ Diện tích Dài Rộng Giá/m2
0 0.000000000 2020-08-05 Đường Hoàng Quốc Việt, Phường Nghĩa Đô, Quận C... Quận Cầu Giấy Phường Nghĩa Đô Nhà ngõ, hẻm Đã có sổ 4 5 phòng 46 m² NaN NaN 86,96 triệu/m²
1 1.000000000 2020-08-05 Đường Kim Giang, Phường Kim Giang, Quận Thanh ... Quận Thanh Xuân Phường Kim Giang Nhà mặt phố, mặt tiền NaN NaN 3 phòng 37 m² NaN NaN 116,22 triệu/m²
2 2.000000000 2020-08-05 phố minh khai, Phường Minh Khai, Quận Hai Bà T... Quận Hai Bà Trưng Phường Minh Khai Nhà ngõ, hẻm Đã có sổ 4 4 phòng 40 m² 10 m 4 m 65 triệu/m²
3 3.000000000 2020-08-05 Đường Võng Thị, Phường Thụy Khuê, Quận Tây Hồ,... Quận Tây Hồ Phường Thụy Khuê Nhà ngõ, hẻm Đã có sổ NaN 6 phòng 51 m² 12.75 m 4 m 100 triệu/m²
4 4.000000000 2020-08-05 Đường Kim Giang, Phường Kim Giang, Quận Thanh ... Quận Thanh Xuân Phường Kim Giang Nhà ngõ, hẻm NaN NaN 4 phòng 36 m² 9 m 4 m 86,11 triệu/m²
In [8]:
# Check info of the dataset

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82497 entries, 0 to 82496
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       82496 non-null  float64
 1   Ngày             82496 non-null  object 
 2   Địa chỉ          82449 non-null  object 
 3   Quận             82495 non-null  object 
 4   Huyện            82449 non-null  object 
 5   Loại hình nhà ở  82465 non-null  object 
 6   Giấy tờ pháp lý  53610 non-null  object 
 7   Số tầng          36399 non-null  object 
 8   Số phòng ngủ     82458 non-null  object 
 9   Diện tích        82495 non-null  object 
 10  Dài              19827 non-null  object 
 11  Rộng             35445 non-null  object 
 12  Giá/m2           82484 non-null  object 
dtypes: float64(1), object(12)
memory usage: 8.2+ MB
In [9]:
# Drop duplicates rows

df.drop_duplicates(inplace=True)
In [10]:
# Drop rows with missing all values

df.dropna(axis=0, how='all', inplace=True)
In [11]:
# Rename columns for easier use

column_names = {'Ngày': 'Date', 'Địa chỉ': 'Address', 'Quận': 'District', 'Huyện': 'Ward',
                'Loại hình nhà ở': 'Type_of_housing', 'Giấy tờ pháp lý': 'Legal_document',
                'Số tầng': 'Number_of_floors', 'Số phòng ngủ': 'Number_of_bedrooms',
                'Diện tích': 'Area', 'Dài': 'Length', 'Rộng': 'Width', 'Giá/m2': 'Price_per_m2'}

df.rename(columns=column_names, inplace=True)
In [12]:
# Drop rows with missing values in 'Price_per_m2' column because it is the target column

df.dropna(axis=0, subset=['Price_per_m2'], inplace=True)
In [13]:
# Drop Unnamed: 0 column

df.drop(['Unnamed: 0'], axis=1, inplace=True)
In [14]:
# Reset index

df.reset_index(drop=True, inplace=True)

Train-set and test-set split before data preprocessing to avoid data leakage¶

In [15]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

EDA¶

Process missing values¶

In [16]:
# Missing values in each column

df_train.isna().sum()
Out[16]:
Date                      0
Address                  39
District                  0
Ward                     43
Type_of_housing          26
Legal_document        23092
Number_of_floors      36888
Number_of_bedrooms       30
Area                      0
Length                50113
Width                 37638
Price_per_m2              0
dtype: int64
In [17]:
msno.matrix(df_train, figsize = (30,10))
plt.title('Missing value in dataset')
plt.show()
In [18]:
# Missing values ratio in each column in train set

ratio_nan_per_col = df_train.isna().sum() / len(df_train)
ratio_nan_per_col
Out[18]:
Date                 0.000000000
Address              0.000591026
District             0.000000000
Ward                 0.000651644
Type_of_housing      0.000394017
Legal_document       0.349947717
Number_of_floors     0.559019201
Number_of_bedrooms   0.000454635
Area                 0.000000000
Length               0.759437465
Width                0.570385076
Price_per_m2         0.000000000
dtype: float64
In [19]:
print('Number of rows before dropping missing values:', len(df_train))
Number of rows before dropping missing values: 65987
In [20]:
# Drop rows with missing values greater than threshold

threshold = 0.01
df_train.dropna(subset=df_train.columns[ratio_nan_per_col < threshold], inplace=True)
In [21]:
print('Number of rows after dropping missing values:', len(df_train))
Number of rows after dropping missing values: 65883
In [22]:
# Missing values ratio in each column in test set

ratio_nan_per_col = df_test.isna().sum() / len(df_test)
ratio_nan_per_col
Out[22]:
Date                 0.000000000
Address              0.000484937
District             0.000060617
Ward                 0.000242468
Type_of_housing      0.000303085
Legal_document       0.350972904
Number_of_floors     0.557737771
Number_of_bedrooms   0.000484937
Area                 0.000000000
Length               0.760623144
Width                0.570224889
Price_per_m2         0.000000000
dtype: float64
In [23]:
# Because this is a test set, we can't drop rows with missing values. Instead, we will fill missing values with the most frequent value in each column

idx, value = extract_idx_value(ratio_nan_per_col)
for col, ratio in zip(idx, value):
    if ratio < threshold:
        df_test[col].fillna(df_test[col].mode()[0], inplace=True)

'Legal_document' column¶

In [24]:
df_train['Legal_document'].isna().sum()
Out[24]:
23026
In [25]:
# Because the 'Legal_document' column has a lot of missing values, we will try to fill them with 'Unknown' value

df_train['Legal_document'].fillna('Unknown', inplace=True)
df_test['Legal_document'].fillna('Unknown', inplace=True)

'Number_of_floors' column¶

In [26]:
df_train['Number_of_floors'].isna().sum()
Out[26]:
36790
In [27]:
# Because the 'Number_of_floors' column has a lot of missing values, we will try to fill them with 'Unknown' value

df_train['Number_of_floors'].fillna('Unknown', inplace=True)
df_test['Number_of_floors'].fillna('Unknown', inplace=True)

'Length', 'Width' columns¶

In [28]:
# We don't need the 'Length', 'Width' columns because we already have the 'Area' column

df_train.drop(['Length', 'Width'], axis=1, inplace=True)
df_test.drop(['Length', 'Width'], axis=1, inplace=True)

'Address' column¶

In [29]:
# We don't need the 'Address' column because we already have the 'District' and 'Ward' columns

df_train.drop(['Address'], axis=1, inplace=True)
df_test.drop(['Address'], axis=1, inplace=True)

Dataset don't have any missing value

In [30]:
print('Number of missing values in train set:')
df_train.isna().sum()
Number of missing values in train set:
Out[30]:
Date                  0
District              0
Ward                  0
Type_of_housing       0
Legal_document        0
Number_of_floors      0
Number_of_bedrooms    0
Area                  0
Price_per_m2          0
dtype: int64
In [31]:
print('Number of missing values in test set:')
df_test.isna().sum()
Number of missing values in test set:
Out[31]:
Date                  0
District              0
Ward                  0
Type_of_housing       0
Legal_document        0
Number_of_floors      0
Number_of_bedrooms    0
Area                  0
Price_per_m2          0
dtype: int64
In [32]:
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
In [33]:
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65883 entries, 0 to 65882
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Date                65883 non-null  object
 1   District            65883 non-null  object
 2   Ward                65883 non-null  object
 3   Type_of_housing     65883 non-null  object
 4   Legal_document      65883 non-null  object
 5   Number_of_floors    65883 non-null  object
 6   Number_of_bedrooms  65883 non-null  object
 7   Area                65883 non-null  object
 8   Price_per_m2        65883 non-null  object
dtypes: object(9)
memory usage: 4.5+ MB
In [34]:
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16497 entries, 0 to 16496
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Date                16497 non-null  object
 1   District            16497 non-null  object
 2   Ward                16497 non-null  object
 3   Type_of_housing     16497 non-null  object
 4   Legal_document      16497 non-null  object
 5   Number_of_floors    16497 non-null  object
 6   Number_of_bedrooms  16497 non-null  object
 7   Area                16497 non-null  object
 8   Price_per_m2        16497 non-null  object
dtypes: object(9)
memory usage: 1.1+ MB

Process data types¶

In [35]:
# Convert 'Date' column to datetime type

df_train['Date'] = pd.to_datetime(df_train['Date'])
df_test['Date'] = pd.to_datetime(df_test['Date'])
In [36]:
df_train['Date'].max()
Out[36]:
Timestamp('2020-08-05 00:00:00')
In [37]:
df_train['Date'].min()
Out[37]:
Timestamp('2019-08-05 00:00:00')

'District' column¶

In [38]:
df_train['District'].unique()
Out[38]:
array(['Quận Hoàng Mai', 'Quận Hà Đông', 'Quận Hai Bà Trưng',
       'Quận Đống Đa', 'Quận Cầu Giấy', 'Quận Ba Đình',
       'Quận Nam Từ Liêm', 'Huyện Thanh Trì', 'Quận Tây Hồ',
       'Quận Thanh Xuân', 'Quận Long Biên', 'Huyện Gia Lâm',
       'Quận Bắc Từ Liêm', 'Huyện Quốc Oai', 'Huyện Hoài Đức',
       'Quận Hoàn Kiếm', 'Huyện Đông Anh', 'Huyện Thanh Oai',
       'Thị xã Sơn Tây', 'Huyện Sóc Sơn', 'Huyện Đan Phượng',
       'Huyện Thường Tín', 'Huyện Chương Mỹ', 'Huyện Phúc Thọ',
       'Huyện Mê Linh', 'Huyện Thạch Thất', 'Huyện Ba Vì',
       'Huyện Phú Xuyên'], dtype=object)
In [39]:
# Replace typos in 'District' column if greater than 98% match

for quan in df_train['District'].unique():
    replace_matches_in_column(df=df_train, column='District', string_to_match=quan, min_ratio=98)
    replace_matches_in_column(df=df_test, column='District', string_to_match=quan, min_ratio=98)

print('Done!')
Done!

Normial category

Convert datatype of 'District' column to 'category'

Nếu các giá trị của 1 cột là hữu hạn, ta nên đưa nó về category, nó sẽ giúp tiết kiệm bộ nhớ, cũng như có thể sử dụng các phương thức khác, mà kiểu object không thể, mặc dù cả hai đều chứa string.
Trong trường hợp, cột đó chứa feedback, hoặc description của sản phẩm, thì ta sẽ để nó ở kiểu object.

In [40]:
before = df_train['District'].memory_usage()  # memory usage of column in bytes
print(f'Before convert to category: {before * 1e-3} kb')
Before convert to category: 527.192 kb
In [41]:
df_train['District'] = df_train['District'].astype('category')

# To make sure the test set has the same categories as the training set, we will use the CategoricalDtype function
df_test['District'] = df_test['District'].astype(CategoricalDtype(categories=df_train['District'].cat.categories))

after = df_train['District'].memory_usage()
print(f'After convert to category: {after * 1e-3} kb')

result = (before - after) / before
print(f"After convert to category, the memory usage of the 'District' column is reduced by: {round(result * 100, 2)}%")
After convert to category: 67.307 kb
After convert to category, the memory usage of the 'District' column is reduced by: 87.23%

After convert to category, we can encode, sort or compare value

In [42]:
df_train['District'].cat.categories
Out[42]:
Index(['Huyện Ba Vì', 'Huyện Chương Mỹ', 'Huyện Gia Lâm', 'Huyện Hoài Đức',
       'Huyện Mê Linh', 'Huyện Phú Xuyên', 'Huyện Phúc Thọ', 'Huyện Quốc Oai',
       'Huyện Sóc Sơn', 'Huyện Thanh Oai', 'Huyện Thanh Trì',
       'Huyện Thường Tín', 'Huyện Thạch Thất', 'Huyện Đan Phượng',
       'Huyện Đông Anh', 'Quận Ba Đình', 'Quận Bắc Từ Liêm', 'Quận Cầu Giấy',
       'Quận Hai Bà Trưng', 'Quận Hoàn Kiếm', 'Quận Hoàng Mai', 'Quận Hà Đông',
       'Quận Long Biên', 'Quận Nam Từ Liêm', 'Quận Thanh Xuân', 'Quận Tây Hồ',
       'Quận Đống Đa', 'Thị xã Sơn Tây'],
      dtype='object')
In [43]:
df_train['District'].head()
Out[43]:
0       Quận Hoàng Mai
1         Quận Hà Đông
2         Quận Hà Đông
3    Quận Hai Bà Trưng
4         Quận Đống Đa
Name: District, dtype: category
Categories (28, object): ['Huyện Ba Vì', 'Huyện Chương Mỹ', 'Huyện Gia Lâm', 'Huyện Hoài Đức', ..., 'Quận Thanh Xuân', 'Quận Tây Hồ', 'Quận Đống Đa', 'Thị xã Sơn Tây']
In [44]:
df_train['District'].cat.codes[:5]
Out[44]:
0    20
1    21
2    21
3    18
4    26
dtype: int8
In [45]:
sns.set_style('whitegrid')
plt.figure(figsize=(20, 10))
sns.countplot(data=df_train, y='District', palette='magma')
plt.title('Number of houses to sale in each district')
plt.xlabel('Number of houses to sale')
plt.ylabel('District')
plt.show()

'Legal_documents' column¶

Ordinal Category

Convert data type to Ordinal category similar to Norminal category, but we have to specify the order of the categories.
Example: 'thấp' < 'trung bình' < 'cao'

In [46]:
df_train['Legal_document'].unique()
Out[46]:
array(['Đã có sổ', 'Unknown', 'Đang chờ sổ', 'Giấy tờ khác'], dtype=object)

We can assume order of the legal documents from the worst to the best is: Unknown < Giấy tờ khác < Đang chờ sổ < Đã có sổ

In [47]:
# Before convert to orinal category

df_train[df_train['Legal_document'] > 'Giấy tờ khác']['Legal_document'].unique()
Out[47]:
array(['Đã có sổ', 'Unknown', 'Đang chờ sổ'], dtype=object)
In [48]:
# After convert ordinal category

order_of_legal_document = ['Unknown', 'Giấy tờ khác', 'Đang chờ sổ', 'Đã có sổ']
df_train['Legal_document'] = df_train['Legal_document'].astype(CategoricalDtype(categories=order_of_legal_document, ordered=True))
df_test['Legal_document'] = df_test['Legal_document'].astype(CategoricalDtype(categories=order_of_legal_document, ordered=True))
In [49]:
# Filter rows with 'Legal_document' > 'Giấy tờ khác'

df_train[df_train['Legal_document'] > 'Giấy tờ khác']['Legal_document'].unique()
Out[49]:
['Đã có sổ', 'Đang chờ sổ']
Categories (4, object): ['Unknown' < 'Giấy tờ khác' < 'Đang chờ sổ' < 'Đã có sổ']
In [50]:
df_train['Legal_document'].head()
Out[50]:
0    Đã có sổ
1    Đã có sổ
2    Đã có sổ
3    Đã có sổ
4    Đã có sổ
Name: Legal_document, dtype: category
Categories (4, object): ['Unknown' < 'Giấy tờ khác' < 'Đang chờ sổ' < 'Đã có sổ']
In [51]:
plt.figure(figsize=(12, 8))
sns.countplot(data=df_train, x='Legal_document', palette='magma')
plt.title('Number of houses to sale in each legal document category')
plt.xlabel('Legal document category')
plt.ylabel('Number of houses to sale')
plt.show()

'Type_of_housing' column¶

In [52]:
df_train['Type_of_housing'].unique()
Out[52]:
array(['Nhà ngõ, hẻm', 'Nhà mặt phố, mặt tiền', 'Nhà biệt thự',
       'Nhà phố liền kề'], dtype=object)
In [53]:
order_of_type_of_housing = ['Nhà ngõ, hẻm', 'Nhà mặt phố, mặt tiền', 'Nhà phố liền kề', 'Nhà biệt thự']
df_train['Type_of_housing'] = df_train['Type_of_housing'].astype(CategoricalDtype(categories=order_of_type_of_housing, ordered=True))
df_test['Type_of_housing'] = df_test['Type_of_housing'].astype(CategoricalDtype(categories=order_of_type_of_housing, ordered=True))
In [54]:
df_train['Type_of_housing'].head()
Out[54]:
0             Nhà ngõ, hẻm
1             Nhà ngõ, hẻm
2             Nhà ngõ, hẻm
3             Nhà ngõ, hẻm
4    Nhà mặt phố, mặt tiền
Name: Type_of_housing, dtype: category
Categories (4, object): ['Nhà ngõ, hẻm' < 'Nhà mặt phố, mặt tiền' < 'Nhà phố liền kề' < 'Nhà biệt thự']
In [55]:
plt.figure(figsize=(12, 8))
sns.countplot(data=df_train, x='Type_of_housing', palette='magma')
plt.title('Number of houses to sale in each type of housing')
plt.xlabel('Type of housing')
plt.ylabel('Number of houses to sale')
plt.show()
In [56]:
g = sns.catplot(data=df_train, x='Type_of_housing', col='Legal_document', kind='count', col_wrap=2, palette='magma')
g.set_axis_labels('Type of housing', 'Number of houses to sale')
g.fig.suptitle('Number of houses to sale in each type of housing and legal document category', y=1.05)
g.set_titles('{col_name}')
g.tick_params(axis='x', rotation=45)
plt.show()

'Ward' column¶

In [57]:
df_train['Ward'].unique()
Out[57]:
array(['Phường Tương Mai', 'Phường Yên Nghĩa', 'Phường Vạn Phúc',
       'Phường Thanh Nhàn', 'Phường Thịnh Quang', 'Phường Trung Liệt',
       'Phường Đồng Tâm', 'Phường Quan Hoa', 'Phường Đống Mác',
       'Phường Liễu Giai', 'Phường Phú Đô', 'Xã Tân Triều',
       'Phường Phú Lương', 'Phường Hoàng Văn Thụ', 'Phường Thụy Khuê',
       'Phường Trương Định', 'Phường Khương Đình', 'Phường Đại Kim',
       'Phường Phương Canh', 'Phường Trung Văn', 'Phường Ô Chợ Dừa',
       'Phường Đức Giang', 'Phường Cầu Dền', 'Phường Thạch Bàn',
       'Phường Bưởi', 'Phường Thanh Lương', 'Phường Ngọc Lâm',
       'Xã Đông Dư', 'Phường Láng Hạ', 'Phường Văn Quán',
       'Phường Nhân Chính', 'Phường La Khê', 'Phường Dịch Vọng',
       'Phường Khương Mai', 'Phường Điện Biên', 'Phường Vĩnh Phúc',
       'Phường Yên Hoà', 'Phường Cổ Nhuế 1', 'Phường Kiến Hưng',
       'Phường Bạch Đằng', 'Phường Định Công', 'Phường Nam Đồng',
       'Phường Ngọc Khánh', 'Phường Yên Sở', 'Phường Nghĩa Đô',
       'Phường Xuân Đỉnh', 'Phường Mai Dịch', 'Phường Thịnh Liệt',
       'Phường Lĩnh Nam', 'Phường Trung Phụng', 'Phường Quang Trung',
       'Phường Kim Giang', 'Phường Trúc Bạch', 'Phường Quốc Tử Giám',
       'Phường Thượng Đình', 'Phường Minh Khai', 'Phường Đội Cấn',
       'Phường Vĩnh Tuy', 'Phường Thanh Xuân Trung', 'Phường Giáp Bát',
       'Phường Bồ Đề', 'Phường Phú La', 'Phường Khâm Thiên',
       'Phường Tây Mỗ', 'Phường Nhật Tân', 'Phường Thanh Xuân Bắc',
       'Phường Dịch Vọng Hậu', 'Phường Tân Mai', 'Phường Hàng Bột',
       'Phường Kim Liên', 'Phường Bạch Mai', 'Phường Thổ Quan',
       'Phường Ngọc Thụy', 'Phường Long Biên', 'Phường Quỳnh Mai',
       'Phường Cống Vị', 'Phường Thượng Thanh', 'Phường Thanh Xuân Nam',
       'Phường Khương Trung', 'Phường Trung Hoà', 'Phường Khương Thượng',
       'Phường Kim Mã', 'Phường Mỹ Đình 1', 'Xã Thanh Liệt',
       'Phường Ngã Tư Sở', 'Xã Đồng Quang', 'Phường Thành Công',
       'Phường Phú Lãm', 'Phường Nguyễn Trãi', 'Phường Gia Thụy',
       'Phường Giang Biên', 'Xã Vân Canh', 'Phường Cầu Diễn',
       'Phường Xuân Phương', 'Phường Láng Thượng', 'Xã Tam Hiệp',
       'Xã Tứ Hiệp', 'Xã Hữu Hoà', 'Phường Phúc Lợi', 'Phường Đồng Nhân',
       'Phường Phố Huế', 'Phường Bách Khoa', 'Thị trấn Trâu Quỳ',
       'Phường Vĩnh Hưng', 'Phường Văn Chương', 'Phường Mộ Lao',
       'Phường Quỳnh Lôi', 'Phường Quảng An', 'Phường Xuân La',
       'Phường Phú Diễn', 'Phường Đại Mỗ', 'Phường Cửa Đông',
       'Phường Mỹ Đình 2', 'Xã Bắc Hồng', 'Xã An Thượng',
       'Phường Thanh Trì', 'Xã Ngọc Hồi', 'Phường Cát Linh',
       'Phường Ngọc Hà', 'Phường Hoàng Liệt', 'Phường Hà Cầu',
       'Phường Lê Đại Hành', 'Phường Phương Liên', 'Phường Mai Động',
       'Phường Yết Kiêu', 'Phường Phúc Đồng', 'Xã Tả Thanh Oai',
       'Phường Giảng Võ', 'Phường Sài Đồng', 'Phường Tứ Liên',
       'Phường Hàng Bồ', 'Phường Nghĩa Tân', 'Xã La Phù',
       'Phường Cổ Nhuế 2', 'Phường Đông Ngạc', 'Phường Cự Khối',
       'Phường Phú Thượng', 'Phường Yên Phụ', 'Phường Phương Liệt',
       'Xã Đặng Xá', 'Phường Văn Miếu', 'Phường Biên Giang',
       'Phường Phạm Đình Hổ', 'Phường Phương Mai', 'Phường Trung Tự',
       'Thị trấn Đông Anh', 'Phường Trần Phú', 'Phường Phúc La',
       'Xã Ngũ Hiệp', 'Phường Đồng Mai', 'Thị trấn Văn Điển',
       'Phường Phúc Tân', 'Phường Bùi Thị Xuân', 'Thị trấn Yên Viên',
       'Phường Mễ Trì', 'Phường Phúc Xá', 'Phường Việt Hưng',
       'Phường Chương Dương', 'Phường Hạ Đình', 'Xã Kim Chung',
       'Phường Hàng Bài', 'Phường Dương Nội', 'Xã Cự Khê',
       'Phường Xuân Tảo', 'Xã Cổ Đông', 'Phường Lý Thái Tổ',
       'Phường Phan Chu Trinh', 'Xã Đông La', 'Phường Ngô Thì Nhậm',
       'Xã Đức Giang', 'Xã Võng La', 'Phường Thụy Phương', 'Xã Yên Viên',
       'Phường Tràng Tiền', 'Xã Vạn Phúc', 'Phường Hàng Buồm',
       'Phường Trần Hưng Đạo', 'Xã Mai Lâm', 'Phường Cửa Nam',
       'Xã Nam Hồng', 'Xã Phú Cường', 'Thị trấn Trạm Trôi',
       'Phường Tây Tựu', 'Xã Liên Ninh', 'Phường Nguyễn Du',
       'Xã Di Trạch', 'Xã Đông Mỹ', 'Phường Phúc Diễn', 'Xã An Khánh',
       'Thị trấn Phùng', 'Phường Đồng Xuân', 'Xã Minh Phú',
       'Phường Phú Thịnh', 'Phường Hàng Gai', 'Phường Hàng Bạc',
       'Xã Bát Tràng', 'Xã Vĩnh Ngọc', 'Xã Kiêu Kỵ', 'Xã Vĩnh Quỳnh',
       'Phường Đức Thắng', 'Phường Hàng Bông', 'Phường Quán Thánh',
       'Xã Tiên Dược', 'Phường Nguyễn Trung Trực', 'Xã Cổ Bi',
       'Xã Duyên Thái', 'Thị trấn Chúc Sơn', 'Xã Khánh Hà',
       'Phường Ngô Quyền', 'Xã Tân Hội', 'Xã Dương Quang',
       'Xã Quang Tiến', 'Xã Phù Đổng', 'Xã Mai Đình', 'Xã Yên Thường',
       'Xã Ngọc Tảo', 'Phường Hàng Mã', 'Xã Vân Côn', 'Xã Đại áng',
       'Xã Hà Hồi', 'Xã Tân Lập', 'Xã Uy Nỗ', 'Xã Đông Yên',
       'Thị trấn Xuân Mai', 'Xã Bích Hòa', 'Thị trấn Quang Minh',
       'Xã Bình Phú', 'Xã Đa Tốn', 'Xã Phú Châu', 'Thị trấn Thường Tín',
       'Xã Phù Linh', 'Phường Hàng Trống', 'Phường Liên Mạc',
       'Phường Thượng Cát', 'Xã Sơn Đông', 'Xã Tam Đồng', 'Xã Sơn Đồng',
       'Xã Phương Đình', 'Xã Phù Lỗ', 'Phường Hàng Đào',
       'Thị trấn Quốc Oai', 'Xã Phú Cát', 'Xã Việt Hùng', 'Xã Phụng Châu',
       'Xã Nhị Khê', 'Xã Bình Yên', 'Xã Võng Xuyên', 'Xã Đại Thịnh',
       'Xã Hòa Thạch', 'Xã Vân Nội', 'Xã Kim Sơn', 'Xã Vân Hòa',
       'Xã Hoàng Văn Thụ', 'Xã Sài Sơn', 'Xã Thủy Xuân Tiên',
       'Xã Đức Thượng', 'Xã Xuân Nộn', 'Xã Đông Hội', 'Phường Xuân Khanh',
       'Xã Thạch Hoà', 'Xã Ninh Sở', 'Xã Kim Hoa', 'Xã Vân Tảo',
       'Xã Dương Liễu', 'Xã Thanh Cao', 'Xã Dục Tú', 'Xã Tiên Dương',
       'Xã Lê Lợi', 'Xã Hải Bối', 'Xã Tiền Phong', 'Thị trấn Sóc Sơn',
       'Phường Trung Sơn Trầm', 'Xã Ngọc Liệp', 'Xã Dương Xá',
       'Xã Phú Sơn', 'Xã Đỗ Động', 'Thị trấn Liên Quan', 'Xã Nguyên Khê',
       'Thị trấn Phú Xuyên', 'Thị trấn Kim Bài', 'Xã Mê Linh',
       'Xã Minh Khai', 'Xã Đại Thành', 'Xã Song Phương', 'Xã Duyên Hà',
       'Xã Đại Yên', 'Xã Tiến Xuân', 'Xã Minh Trí', 'Xã Xuân Giang',
       'Xã Ninh Hiệp', 'Xã Phú Minh', 'Xã Phương Trung', 'Xã Nghĩa Hương'],
      dtype=object)
In [58]:
df_train['Ward'] = df_train['Ward'].astype('category')
df_test['Ward'] = df_test['Ward'].astype(CategoricalDtype(categories=df_train['Ward'].cat.categories))

'Number_of_floors' column¶

In [59]:
plt.figure(figsize=(20, 10))
sns.countplot(data=df_train, x='Number_of_floors', palette='magma')
plt.title('Number of houses to sale in each number of floors')
plt.xlabel('Number of floors')
plt.ylabel('Number of houses to sale')
plt.show()

Number or category greater than 10 is too few, we will group them into '10' category

In [60]:
def process_so_tang(x):
    try:
        if x == 'Nhiều hơn 10' or int(x) > 10:
            return '10'
        return x
    except:
        return x
In [61]:
df_train['Number_of_floors'] = df_train['Number_of_floors'].apply(process_so_tang)
df_test['Number_of_floors'] = df_test['Number_of_floors'].apply(process_so_tang)
In [62]:
df_train['Number_of_floors'].value_counts()
Out[62]:
Number_of_floors
Unknown    36790
5          12637
4           9831
3           2889
6           1681
2            811
1            506
7            465
8            147
9             70
10            56
Name: count, dtype: int64
In [63]:
# Convert 'Number_of_floors' column to ordinal category

order_of_number_of_floors = ['Unknown', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
df_train['Number_of_floors'] = df_train['Number_of_floors'].astype(CategoricalDtype(categories=order_of_number_of_floors, ordered=True))
df_test['Number_of_floors'] = df_test['Number_of_floors'].astype(CategoricalDtype(categories=order_of_number_of_floors, ordered=True))
In [64]:
df_train['Number_of_floors'].head()
Out[64]:
0    Unknown
1    Unknown
2          3
3    Unknown
4    Unknown
Name: Number_of_floors, dtype: category
Categories (11, object): ['Unknown' < '1' < '2' < '3' ... '7' < '8' < '9' < '10']
In [65]:
plt.figure(figsize=(12, 8))
sns.countplot(data=df_train, x='Number_of_floors', palette='magma')
plt.title('Number of houses to sale in each number of floors')
plt.xlabel('Number of floors')
plt.ylabel('Number of houses to sale')
plt.show()
In [66]:
g = sns.catplot(data=df_train, x='Number_of_floors', col='Legal_document', kind='count', col_wrap=2, palette='magma')
g.fig.suptitle('Number of houses to sale in each number of floors and legal document category', y=1.05)
g.set_axis_labels('Number of floors', 'Number of houses to sale')
plt.show()

'Number_of_bedrooms' column¶

In [67]:
df_train['Number_of_bedrooms'].unique()
Out[67]:
array(['2 phòng', '4 phòng', '6 phòng', '3 phòng', '8 phòng', '5 phòng',
       '1 phòng', '7 phòng', 'nhiều hơn 10 phòng', '10 phòng', '9 phòng'],
      dtype=object)
In [68]:
# Replace 'nhiều hơn 10 phòng' with '10 phòng'

df_train['Number_of_bedrooms'].replace('nhiều hơn 10 phòng', '10 phòng', inplace=True)
df_test['Number_of_bedrooms'].replace('nhiều hơn 10 phòng', '10 phòng', inplace=True)
In [69]:
# Convert 'Number_of_bedrooms' column to numeric type

df_train['Number_of_bedrooms'] = df_train['Number_of_bedrooms'].str.replace('phòng', '')
df_train['Number_of_bedrooms'] = df_train['Number_of_bedrooms'].astype('int')

df_test['Number_of_bedrooms'] = df_test['Number_of_bedrooms'].str.replace('phòng', '')
df_test['Number_of_bedrooms'] = df_test['Number_of_bedrooms'].astype('int')
In [70]:
plt.figure(figsize=(12, 8))
sns.countplot(data=df_train, x='Number_of_bedrooms', palette='magma')
plt.title('Number of houses to sale in each number of bedrooms')
plt.xlabel('Number of bedrooms')
plt.ylabel('Number of houses to sale')
plt.show()

'Area' column¶

In [71]:
# Remove 'm²' and replace ',' with '.' to convert data type to float

df_train['Area'] = df_train['Area'].apply(lambda x: x.replace('m²', '').replace(',', '.')).astype(float)
df_test['Area'] = df_test['Area'].apply(lambda x: x.replace('m²', '').replace(',', '.')).astype(float)

'Price_per_m2' column¶

In [72]:
df_train['Price_per_m2'].value_counts()
Out[72]:
Price_per_m2
100 triệu/m²       2014
80 triệu/m²         909
75 triệu/m²         763
90 triệu/m²         674
83,33 triệu/m²      647
                   ... 
163,83 triệu/m²       1
21,62 triệu/m²        1
15,28 triệu/m²        1
68.292 đ/m²           1
36,89 triệu/m²        1
Name: count, Length: 7553, dtype: int64

Price_per_m2 column has string values with price + unit, we will move the unit to the Unit column

In [73]:
print('Units:')
units = df_train['Price_per_m2'].apply(lambda x: x.split(' ')[1])
units.value_counts()
Units:
Out[73]:
Price_per_m2
triệu/m²    65209
đ/m²          587
tỷ/m²          87
Name: count, dtype: int64
In [74]:
df_train[units == 'đ/m²']
Out[74]:
Date District Ward Type_of_housing Legal_document Number_of_floors Number_of_bedrooms Area Price_per_m2
67 2020-07-13 Quận Hoàng Mai Phường Thịnh Liệt Nhà ngõ, hẻm Unknown Unknown 3 33.000000000 80.303 đ/m²
205 2020-07-13 Quận Nam Từ Liêm Phường Xuân Phương Nhà biệt thự Unknown Unknown 5 200.000000000 29.500 đ/m²
288 2020-07-20 Quận Hà Đông Phường Văn Quán Nhà ngõ, hẻm Đã có sổ 5 5 37.000000000 112.162 đ/m²
299 2020-07-22 Quận Thanh Xuân Phường Thanh Xuân Trung Nhà mặt phố, mặt tiền Unknown Unknown 4 40.000000000 100.000 đ/m²
353 2020-07-25 Quận Cầu Giấy Phường Mai Dịch Nhà ngõ, hẻm Unknown Unknown 4 61.000000000 73.770 đ/m²
... ... ... ... ... ... ... ... ... ...
65522 2020-06-27 Quận Thanh Xuân Phường Khương Mai Nhà ngõ, hẻm Đã có sổ 5 4 31.000000000 90.000 đ/m²
65606 2020-07-03 Quận Nam Từ Liêm Phường Phú Đô Nhà ngõ, hẻm Unknown Unknown 3 31.000000000 70.967 đ/m²
65615 2020-06-08 Quận Hà Đông Phường Phúc La Nhà phố liền kề Đã có sổ 4 4 50.000000000 97.000 đ/m²
65734 2020-06-18 Quận Cầu Giấy Phường Mai Dịch Nhà mặt phố, mặt tiền Đã có sổ 4 5 44.000000000 173.863 đ/m²
65790 2020-07-11 Quận Nam Từ Liêm Phường Mỹ Đình 2 Nhà mặt phố, mặt tiền Đã có sổ Unknown 6 75.000000000 73.333 đ/m²

587 rows × 9 columns

'đ/m²' may be a typo of 'triệu/m²'

In [75]:
df_train[units == 'tỷ/m²']
Out[75]:
Date District Ward Type_of_housing Legal_document Number_of_floors Number_of_bedrooms Area Price_per_m2
1832 2020-06-08 Quận Hoàn Kiếm Phường Lý Thái Tổ Nhà mặt phố, mặt tiền Unknown Unknown 4 100.000000000 1,5 tỷ/m²
2050 2020-08-03 Quận Hoàn Kiếm Phường Hàng Bồ Nhà mặt phố, mặt tiền Đã có sổ 8 10 126.000000000 1,111111111 tỷ/m²
4118 2020-07-20 Quận Thanh Xuân Phường Thượng Đình Nhà ngõ, hẻm Đã có sổ Unknown 3 25.000000000 1,14 tỷ/m²
4688 2020-07-17 Quận Đống Đa Phường Ô Chợ Dừa Nhà ngõ, hẻm Đã có sổ Unknown 2 2.000000000 1,25 tỷ/m²
5082 2020-06-30 Quận Hoàng Mai Phường Mai Động Nhà mặt phố, mặt tiền Đã có sổ 6 10 49.000000000 1,122448979 tỷ/m²
... ... ... ... ... ... ... ... ... ...
59380 2020-07-19 Quận Nam Từ Liêm Phường Mễ Trì Nhà mặt phố, mặt tiền Đã có sổ 4 3 100.000000000 8,9 tỷ/m²
61093 2020-08-04 Quận Hà Đông Phường La Khê Nhà phố liền kề Unknown Unknown 6 5.000000000 1,96 tỷ/m²
61490 2020-07-31 Quận Long Biên Phường Long Biên Nhà ngõ, hẻm Đã có sổ Unknown 4 2.000000000 2,65 tỷ/m²
62002 2020-07-31 Quận Nam Từ Liêm Phường Phú Đô Nhà ngõ, hẻm Đã có sổ 5 3 30.000000000 1,133333333 tỷ/m²
64910 2020-06-11 Quận Đống Đa Phường Hàng Bột Nhà ngõ, hẻm Đã có sổ 4 4 65.000000000 7,692307692 tỷ/m²

87 rows × 9 columns

In [76]:
df_train['Unit'] = df_train['Price_per_m2'].apply(lambda x: x.split(' ')[1])
df_train['Price_per_m2'] = df_train['Price_per_m2'].apply(lambda x: x.split(' ')[0])

df_test['Unit'] = df_test['Price_per_m2'].apply(lambda x: x.split(' ')[1])
df_test['Price_per_m2'] = df_test['Price_per_m2'].apply(lambda x: x.split(' ')[0])
In [77]:
# Check if there is any value that cannot be converted to float

idx, values = extract_idx_value(df_train['Price_per_m2'].apply(lambda x: x.replace(',', '.')))

for _idx, value in zip(idx, values):
    try:
        float(value)
    except:
        print(_idx, value)
26102 2.222.22220022
46051 728.000.00728
In [78]:
idx, values = extract_idx_value(df_test['Price_per_m2'].apply(lambda x: x.replace(',', '.')))

for _idx, value in zip(idx, values):
    try:
        float(value)
    except:
        print(_idx, value)
In [79]:
# Remove row with index 26102 and 46051 because they cannot be converted to float

df_train.drop([26102, 46051], inplace=True)
df_train['Price_per_m2'] = df_train['Price_per_m2'].apply(lambda x: x.replace(',', '.')).astype(float)
df_test['Price_per_m2'] = df_test['Price_per_m2'].apply(lambda x: x.replace(',', '.')).astype(float)
In [80]:
df_train.reset_index(drop=True, inplace=True)
In [81]:
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65881 entries, 0 to 65880
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                65881 non-null  datetime64[ns]
 1   District            65881 non-null  category      
 2   Ward                65881 non-null  category      
 3   Type_of_housing     65881 non-null  category      
 4   Legal_document      65881 non-null  category      
 5   Number_of_floors    65881 non-null  category      
 6   Number_of_bedrooms  65881 non-null  int32         
 7   Area                65881 non-null  float64       
 8   Price_per_m2        65881 non-null  float64       
 9   Unit                65881 non-null  object        
dtypes: category(5), datetime64[ns](1), float64(2), int32(1), object(1)
memory usage: 2.7+ MB

Dataset after process data types

In [82]:
df_train.describe()
Out[82]:
Date Number_of_bedrooms Area Price_per_m2
count 65881 65881.000000000 65881.000000000 65881.000000000
mean 2020-07-07 17:00:00.382507776 3.874667962 52.165548793 102.958278322
min 2019-08-05 00:00:00 1.000000000 1.000000000 1.000000000
25% 2020-06-23 00:00:00 3.000000000 34.000000000 73.250000000
50% 2020-07-09 00:00:00 4.000000000 40.000000000 90.000000000
75% 2020-07-24 00:00:00 4.000000000 50.000000000 110.710000000
max 2020-08-05 00:00:00 10.000000000 111411.000000000 998.000000000
std NaN 1.454825385 525.347406958 65.998854713

Feature Engineering¶

In [83]:
# Create a new column 'Price' = 'Price_per_m2' * 'Area' (bilion VND), if 'Unit' != 'tỷ/m²' then 'Price' /= 1000

for row, col in df_train.iterrows():
    df_train.loc[row, 'Price'] = col['Price_per_m2'] * col['Area']
    if col['Unit'] != 'tỷ/m²':
        df_train.loc[row, 'Price'] /= 1000
In [84]:
for row, col in df_test.iterrows():
    df_test.loc[row, 'Price'] = col['Price_per_m2'] * col['Area']
    if col['Unit'] != 'tỷ/m²':
        df_test.loc[row, 'Price'] /= 1000
In [85]:
# Create a new column 'Inner_city' if 'District' have 'Quận'

df_train['Inner_city'] = df_train['District'].apply(lambda x: 1 if 'Quận' in x else 0)
df_test['Inner_city'] = df_test['District'].apply(lambda x: 1 if 'Quận' in x else 0)
In [86]:
# Create a new column 'Year' and 'Month' from 'Date' column

df_train['Year'] = df_train['Date'].dt.year
df_train['Month'] = df_train['Date'].dt.month

df_test['Year'] = df_test['Date'].dt.year
df_test['Month'] = df_test['Date'].dt.month
In [87]:
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_train, x='Price', color='#F38181')
plt.title('Boxplot of price')
plt.xlabel('Price')
plt.show()

Train set have a lot of extreme values, we will remove them to easy visualize and make the predict model more robust

In [88]:
# Remove outliers in 'Price' column

iqr = df_train['Price'].quantile(0.75) - df_train['Price'].quantile(0.25)
upper_bound = df_train['Price'].quantile(0.75) + 1.5 * iqr
lower_bound = df_train['Price'].quantile(0.25) - 1.5 * iqr

df_train_no_outlier = df_train[(df_train['Price'] < upper_bound) & (df_train['Price'] > lower_bound)]
In [89]:
print(f'Số lượng outlier: {len(df_train) - len(df_train_no_outlier)}')
Số lượng outlier: 6280
In [90]:
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_train_no_outlier, x='Price', color='#F38181')
plt.title('Boxplot of price (no outlier)')
plt.xlabel('Price (bilion VND)')
plt.show()
In [91]:
df_train_no_outlier.describe()
Out[91]:
Date Number_of_bedrooms Area Price_per_m2 Price Inner_city Year Month
count 59601 59601.000000000 59601.000000000 59601.000000000 59601.000000000 59601.000000000 59601.000000000 59601.000000000
mean 2020-07-07 20:59:54.382644480 3.736799718 42.960118454 90.270592893 3.709747373 0.972215231 2019.999781883 6.724987836
min 2019-08-05 00:00:00 1.000000000 1.000000000 1.000000000 0.026000000 0.000000000 2019.000000000 2.000000000
25% 2020-06-23 00:00:00 3.000000000 33.000000000 71.670000000 2.699900000 1.000000000 2020.000000000 6.000000000
50% 2020-07-09 00:00:00 4.000000000 40.000000000 87.100000000 3.399990000 1.000000000 2020.000000000 7.000000000
75% 2020-07-24 00:00:00 4.000000000 48.000000000 103.330000000 4.500000000 1.000000000 2020.000000000 7.000000000
max 2020-08-05 00:00:00 10.000000000 4207.000000000 998.000000000 8.499900000 1.000000000 2020.000000000 12.000000000
std NaN 1.278331135 37.384676264 34.460070071 1.517406929 0.164357016 0.014767303 0.638387172
In [103]:
plt.figure(figsize=(12, 8))
sns.histplot(data=df_train_no_outlier, x='Price', bins=50, color='#F38181')
plt.title('Phân phối giá nhà')
plt.xlabel('Giá nhà (tỷ)')
plt.ylabel('Số lượng')
plt.show()
In [93]:
plt.figure(figsize=(12, 8))
sns.countplot(data=df_train_no_outlier, y='District', hue='Inner_city', order=df_train_no_outlier['District'].value_counts().index, palette='magma')
plt.title('Number of houses to sale in each district')
plt.xlabel('Number of houses to sale')
plt.ylabel('District')
plt.show()
In [94]:
df_train_no_outlier.groupby('District')['Price'].mean().sort_values(ascending=False)
Out[94]:
District
Huyện Thạch Thất    5.092731111
Quận Cầu Giấy       4.560376670
Huyện Ba Vì         4.550900000
Quận Tây Hồ         4.298711457
Huyện Quốc Oai      4.157169286
Quận Ba Đình        4.005533481
Quận Đống Đa        3.914877781
Quận Thanh Xuân     3.891731290
Quận Hoàn Kiếm      3.854751580
Quận Nam Từ Liêm    3.760277353
Quận Long Biên      3.632982005
Quận Hai Bà Trưng   3.604108911
Huyện Đan Phượng    3.568718000
Quận Bắc Từ Liêm    3.535226007
Huyện Gia Lâm       3.339598590
Quận Hoàng Mai      3.261049142
Quận Hà Đông        3.249754416
Huyện Mê Linh       3.238737143
Huyện Thanh Trì     2.686734193
Huyện Chương Mỹ     2.601325625
Huyện Đông Anh      2.499684232
Huyện Sóc Sơn       2.406941212
Thị xã Sơn Tây      2.387376923
Huyện Phúc Thọ      2.312555000
Huyện Thường Tín    2.163920000
Huyện Hoài Đức      2.026118674
Huyện Thanh Oai     1.592710000
Huyện Phú Xuyên             NaN
Name: Price, dtype: float64
In [95]:
mean_price_per_district = df_train_no_outlier.groupby('District')['Price'].mean().sort_values(ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(
    x=mean_price_per_district.values, y=mean_price_per_district.index,
    order=mean_price_per_district.index, palette='magma'
)
plt.title('Mean price per district')
plt.xlabel('Mean price (bilion VND)')
plt.ylabel('District')
plt.show()
In [96]:
g = sns.catplot(data=df_train_no_outlier, x='Type_of_housing', y='Price', palette='magma', col='Legal_document', col_wrap=2, kind='point')
g.set_axis_labels('Type of housing', 'Price (bilion VND)')
g.fig.suptitle('Price of each type of housing and legal document category', y=1.05)
g.tick_params(axis='x', rotation=45)
plt.show()
In [97]:
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_train_no_outlier, x='Number_of_floors', y='Price', palette='magma')
plt.title('Price of each number of floors')
plt.xlabel('Number of floors')
plt.ylabel('Price (bilion VND)')
plt.show()
In [98]:
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_train_no_outlier, x='Number_of_bedrooms', y='Price', palette='magma')
plt.title('Price of each number of bedrooms')
plt.xlabel('Number of bedrooms')
plt.ylabel('Price (bilion VND)')
plt.show()
In [99]:
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_train_no_outlier, x='Inner_city', y='Price', palette='magma')
plt.title('Price of inner city and suburb')
plt.xlabel('Inner city')
plt.ylabel('Price (bilion VND)')
plt.show()
In [100]:
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_train_no_outlier, x='Year', y='Price', palette='magma')
plt.title('Price of each year')
plt.xlabel('Year')
plt.ylabel('Price (bilion VND)')
plt.show()
In [116]:
plt.figure(figsize=(12, 8))
sns.heatmap(df_train_no_outlier.select_dtypes(include=['number']).corr(), cmap='magma', annot=True)
plt.title('Correlation matrix')
plt.show()

Xây dựng mô hình dự đoán giá nhà¶

In [136]:
X = df_train_no_outlier.copy()
y = X.pop('Price')
In [137]:
X.drop(['Date', 'Price_per_m2', 'Unit', 'Year', 'Month'], axis=1, inplace=True)

Baseline model¶

In [138]:
X.head()
Out[138]:
District Ward Type_of_housing Legal_document Number_of_floors Number_of_bedrooms Area Inner_city
0 Quận Hoàng Mai Phường Tương Mai Nhà ngõ, hẻm Đã có sổ Unknown 2 27.000000000 1
1 Quận Hà Đông Phường Yên Nghĩa Nhà ngõ, hẻm Đã có sổ Unknown 4 36.000000000 1
2 Quận Hà Đông Phường Vạn Phúc Nhà ngõ, hẻm Đã có sổ 3 2 25.000000000 1
3 Quận Hai Bà Trưng Phường Thanh Nhàn Nhà ngõ, hẻm Đã có sổ Unknown 4 38.000000000 1
5 Quận Đống Đa Phường Trung Liệt Nhà ngõ, hẻm Đã có sổ 4 4 40.000000000 1
In [139]:
X_baseline = X.select_dtypes(include=['number']).copy()
In [140]:
X_baseline.info()
<class 'pandas.core.frame.DataFrame'>
Index: 59601 entries, 0 to 65880
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Number_of_bedrooms  59601 non-null  int32  
 1   Area                59601 non-null  float64
 2   Inner_city          59601 non-null  int64  
dtypes: float64(1), int32(1), int64(1)
memory usage: 3.6 MB
In [141]:
baseline_classifier = Classifier()
baseline_classifier.fit(X_baseline, y)
100%|██████████| 5/5 [00:12<00:00,  2.48s/it]
In [142]:
baseline_summary = baseline_classifier.summary()
baseline_summary.sort_values(by=['Score RMSE', 'Std RMSE', 'Score MAE', 'Std MAE'], ascending=True)
Out[142]:
Score RMSE Std RMSE Score MAE Std MAE
XGBoost 1.170453051 0.004821788 1.369983594 0.011310147
Random Forest 1.175691598 0.004093763 1.382267494 0.009643962
Decision Tree 1.186835958 0.003005849 1.408588627 0.007142190
KNN 1.258648950 0.007125662 1.584247953 0.017975179
Linear Model 1.352226798 0.009125451 1.828600588 0.024758854

Encode categorical columns¶

Ordinal encode¶

In [143]:
X_ordinal = X.copy()
for col in X.select_dtypes(include=['category']):
    X_ordinal[col] = X_ordinal[col].cat.codes
In [144]:
ordinal_encode_classifier = Classifier()
ordinal_encode_classifier.fit(X_ordinal, y)
100%|██████████| 5/5 [01:10<00:00, 14.09s/it]
In [145]:
ordinal_encode_summary = ordinal_encode_classifier.summary()
ordinal_encode_summary.sort_values(by=['Score RMSE', 'Std RMSE', 'Score MAE', 'Std MAE'], ascending=True)
Out[145]:
Score RMSE Std RMSE Score MAE Std MAE
XGBoost 0.995290663 0.006043268 0.990640026 0.012016104
Random Forest 1.044308628 0.007126051 1.090631291 0.014896763
KNN 1.093720354 0.009900168 1.196322225 0.021584416
Decision Tree 1.295991342 0.016780967 1.679875160 0.043584059
Linear Model 1.305201795 0.005191121 1.703578673 0.013560540

Ordinal encoder + standard scale¶

In [146]:
scaler = StandardScaler()
In [147]:
X_scaled = pd.DataFrame(scaler.fit_transform(X_ordinal))
X_scaled.columns = X_ordinal.columns.astype(str)
In [148]:
scaled_classifier = Classifier()
scaled_classifier.fit(X_scaled, y)
100%|██████████| 5/5 [01:14<00:00, 14.87s/it]
In [149]:
scaled_summary = scaled_classifier.summary()
scaled_summary.sort_values(by=['Score RMSE', 'Std RMSE', 'Score MAE', 'Std MAE'], ascending=True)
Out[149]:
Score RMSE Std RMSE Score MAE Std MAE
XGBoost 0.995290663 0.006043268 0.990640026 0.012016104
Random Forest 1.044290611 0.006934076 1.090590961 0.014496976
KNN 1.105905520 0.004346702 1.223045912 0.009628786
Decision Tree 1.298102887 0.016022981 1.685327840 0.041646937
Linear Model 1.305201795 0.005191121 1.703578673 0.013560540

Select XGBoost for final model¶

Hyperparameter tuning¶

In [158]:
def objective(trial, X=X_scaled, y=y):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'n_estimators': trial.suggest_int('n_estimators', 2, 8000),
        'max_depth': trial.suggest_int('max_depth', 0, 20),
        'min_split_loss': trial.suggest_float('min_split_loss', 0, 20),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
    }

    kf = KFold(n_splits=5)
    scores = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = XGBRegressor(**params, random_state=42, n_jobs=-1, tree_method='gpu_hist')
        model.fit(X_train, y_train, verbose=False)

        y_pred = model.predict(X_val)
        test_score = mean_squared_error(y_val, y_pred, squared=False)
        scores.append(test_score)
    
    return np.mean(scores)
In [159]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
[I 2023-07-15 00:48:25,186] A new study created in memory with name: no-name-14af0196-b0fc-486d-9581-ed9b6db9ea16
[I 2023-07-15 00:49:10,260] Trial 0 finished with value: 1.1250230572882367 and parameters: {'learning_rate': 0.9340149460286932, 'n_estimators': 4688, 'max_depth': 19, 'min_split_loss': 3.031056757066639, 'subsample': 0.7270562997299521}. Best is trial 0 with value: 1.1250230572882367.
[I 2023-07-15 00:49:55,908] Trial 1 finished with value: 1.0359117631637287 and parameters: {'learning_rate': 0.11283321785587012, 'n_estimators': 4637, 'max_depth': 6, 'min_split_loss': 15.002864008496006, 'subsample': 0.19642275580878366}. Best is trial 1 with value: 1.0359117631637287.
[I 2023-07-15 00:50:11,788] Trial 2 finished with value: 1.047764727745027 and parameters: {'learning_rate': 0.8852240216140673, 'n_estimators': 1713, 'max_depth': 11, 'min_split_loss': 11.697121458750612, 'subsample': 0.7412674904024358}. Best is trial 1 with value: 1.0359117631637287.
[I 2023-07-15 00:52:03,302] Trial 3 finished with value: 9.721233591854455e+17 and parameters: {'learning_rate': 0.7742775569644892, 'n_estimators': 6819, 'max_depth': 5, 'min_split_loss': 7.730907310680976, 'subsample': 0.13643928172009717}. Best is trial 1 with value: 1.0359117631637287.
[I 2023-07-15 00:53:07,295] Trial 4 finished with value: 1.0241379615929507 and parameters: {'learning_rate': 0.16044060031149873, 'n_estimators': 7426, 'max_depth': 16, 'min_split_loss': 14.750495240714216, 'subsample': 0.6347798721990907}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:53:18,395] Trial 5 finished with value: 1.0454803597770355 and parameters: {'learning_rate': 0.4517542123423569, 'n_estimators': 1077, 'max_depth': 18, 'min_split_loss': 8.743182453994736, 'subsample': 0.2845464496843517}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:54:18,411] Trial 6 finished with value: 1.0701166157124962 and parameters: {'learning_rate': 0.8832193974636876, 'n_estimators': 6327, 'max_depth': 20, 'min_split_loss': 8.594707040873338, 'subsample': 0.5589330639236176}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:57:41,794] Trial 7 finished with value: 3.137254665499745 and parameters: {'learning_rate': 0.36375723358156986, 'n_estimators': 7628, 'max_depth': 10, 'min_split_loss': 3.75827298356316, 'subsample': 0.11759291999871106}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:58:02,550] Trial 8 finished with value: 1.0314505702201653 and parameters: {'learning_rate': 0.041123957838678595, 'n_estimators': 1856, 'max_depth': 4, 'min_split_loss': 9.194424087900732, 'subsample': 0.12364982902843785}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:58:06,079] Trial 9 finished with value: 1.079149244087705 and parameters: {'learning_rate': 0.6001551636610668, 'n_estimators': 259, 'max_depth': 19, 'min_split_loss': 10.300397857808456, 'subsample': 0.2349476075621314}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:58:32,606] Trial 10 finished with value: 1.0326783681090064 and parameters: {'learning_rate': 0.2290102728528943, 'n_estimators': 3176, 'max_depth': 14, 'min_split_loss': 19.70439544086861, 'subsample': 0.9730772555535183}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:58:58,961] Trial 11 finished with value: 1.029049827524942 and parameters: {'learning_rate': 0.039014582516654434, 'n_estimators': 2768, 'max_depth': 0, 'min_split_loss': 14.627062515083827, 'subsample': 0.39506246391408517}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 00:59:30,264] Trial 12 finished with value: 1.0733049011401046 and parameters: {'learning_rate': 0.015212016837314546, 'n_estimators': 3563, 'max_depth': 1, 'min_split_loss': 15.933852468345224, 'subsample': 0.4102048899247697}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 01:00:19,906] Trial 13 finished with value: 1.0263106874106933 and parameters: {'learning_rate': 0.20549189455963696, 'n_estimators': 5574, 'max_depth': 15, 'min_split_loss': 13.79438979500648, 'subsample': 0.4224305605417833}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 01:01:10,795] Trial 14 finished with value: 1.0340909909599199 and parameters: {'learning_rate': 0.24299465903258768, 'n_estimators': 5872, 'max_depth': 15, 'min_split_loss': 18.940550620724398, 'subsample': 0.5535886760992442}. Best is trial 4 with value: 1.0241379615929507.
[I 2023-07-15 01:02:19,898] Trial 15 finished with value: 1.0240405015211635 and parameters: {'learning_rate': 0.22051529251451504, 'n_estimators': 7846, 'max_depth': 15, 'min_split_loss': 12.959959717967863, 'subsample': 0.42768968606330543}. Best is trial 15 with value: 1.0240405015211635.
[I 2023-07-15 01:03:26,027] Trial 16 finished with value: 1.022620890228835 and parameters: {'learning_rate': 0.3694835641389733, 'n_estimators': 7545, 'max_depth': 12, 'min_split_loss': 11.948908745654755, 'subsample': 0.6294281418297402}. Best is trial 16 with value: 1.022620890228835.
[I 2023-07-15 01:04:34,882] Trial 17 finished with value: 1.029749414425177 and parameters: {'learning_rate': 0.34343336605013475, 'n_estimators': 7749, 'max_depth': 11, 'min_split_loss': 12.51195831124374, 'subsample': 0.339322548543032}. Best is trial 16 with value: 1.022620890228835.
[I 2023-07-15 01:05:39,197] Trial 18 finished with value: 1.0396812474908448 and parameters: {'learning_rate': 0.5540989155273346, 'n_estimators': 6816, 'max_depth': 8, 'min_split_loss': 6.087588626728266, 'subsample': 0.48742242822042575}. Best is trial 16 with value: 1.022620890228835.
[I 2023-07-15 01:07:38,974] Trial 19 finished with value: 1.2864326317762615 and parameters: {'learning_rate': 0.33248834297821883, 'n_estimators': 5248, 'max_depth': 13, 'min_split_loss': 1.0882844670751357, 'subsample': 0.29499541218365094}. Best is trial 16 with value: 1.022620890228835.
[I 2023-07-15 01:08:53,072] Trial 20 finished with value: 1.0275731061290059 and parameters: {'learning_rate': 0.4523346709590344, 'n_estimators': 7989, 'max_depth': 8, 'min_split_loss': 11.137680358670742, 'subsample': 0.48841135983123224}. Best is trial 16 with value: 1.022620890228835.
[I 2023-07-15 01:09:58,067] Trial 21 finished with value: 1.0227916762340699 and parameters: {'learning_rate': 0.18654789453566237, 'n_estimators': 7023, 'max_depth': 17, 'min_split_loss': 13.005614503702065, 'subsample': 0.6296459208578176}. Best is trial 16 with value: 1.022620890228835.
[I 2023-07-15 01:11:02,746] Trial 22 finished with value: 1.0226765341200592 and parameters: {'learning_rate': 0.2578807506595512, 'n_estimators': 6843, 'max_depth': 17, 'min_split_loss': 12.63604301116292, 'subsample': 0.6421734434872026}. Best is trial 16 with value: 1.022620890228835.
[I 2023-07-15 01:12:06,817] Trial 23 finished with value: 1.0193920193436639 and parameters: {'learning_rate': 0.29983202746844295, 'n_estimators': 6779, 'max_depth': 17, 'min_split_loss': 11.037328570516278, 'subsample': 0.6492194452599176}. Best is trial 23 with value: 1.0193920193436639.
[I 2023-07-15 01:13:06,735] Trial 24 finished with value: 1.0175165564515563 and parameters: {'learning_rate': 0.30119234151757046, 'n_estimators': 6237, 'max_depth': 13, 'min_split_loss': 10.404607681423421, 'subsample': 0.737911680617044}. Best is trial 24 with value: 1.0175165564515563.
[I 2023-07-15 01:14:13,474] Trial 25 finished with value: 1.0203142420887672 and parameters: {'learning_rate': 0.31521958484645934, 'n_estimators': 6105, 'max_depth': 13, 'min_split_loss': 10.443909813039111, 'subsample': 0.807384566458903}. Best is trial 24 with value: 1.0175165564515563.
[I 2023-07-15 01:15:25,156] Trial 26 finished with value: 1.0187463703531017 and parameters: {'learning_rate': 0.29048587231903034, 'n_estimators': 6090, 'max_depth': 13, 'min_split_loss': 10.394126262307019, 'subsample': 0.8195850025765146}. Best is trial 24 with value: 1.0175165564515563.
[I 2023-07-15 01:16:18,064] Trial 27 finished with value: 1.017923130295856 and parameters: {'learning_rate': 0.12983502010790532, 'n_estimators': 4905, 'max_depth': 9, 'min_split_loss': 10.518427739277598, 'subsample': 0.8521056187222901}. Best is trial 24 with value: 1.0175165564515563.
[I 2023-07-15 01:17:04,992] Trial 28 finished with value: 1.0098563569094956 and parameters: {'learning_rate': 0.12067525078231542, 'n_estimators': 4518, 'max_depth': 8, 'min_split_loss': 6.445463775859403, 'subsample': 0.8636210349366354}. Best is trial 28 with value: 1.0098563569094956.
[I 2023-07-15 01:17:51,701] Trial 29 finished with value: 1.0132146217622997 and parameters: {'learning_rate': 0.11159133816808892, 'n_estimators': 4467, 'max_depth': 7, 'min_split_loss': 6.36688288813265, 'subsample': 0.9366288478751225}. Best is trial 28 with value: 1.0098563569094956.
[I 2023-07-15 01:18:37,256] Trial 30 finished with value: 1.0143454799835585 and parameters: {'learning_rate': 0.11217166997787245, 'n_estimators': 4206, 'max_depth': 7, 'min_split_loss': 4.992279568544612, 'subsample': 0.9854272546613878}. Best is trial 28 with value: 1.0098563569094956.
[I 2023-07-15 01:19:21,303] Trial 31 finished with value: 1.0170200992305882 and parameters: {'learning_rate': 0.1070832971161489, 'n_estimators': 4290, 'max_depth': 7, 'min_split_loss': 6.154461099974491, 'subsample': 0.984129361414134}. Best is trial 28 with value: 1.0098563569094956.
[I 2023-07-15 01:20:03,848] Trial 32 finished with value: 1.0222328735685084 and parameters: {'learning_rate': 0.09475948495954128, 'n_estimators': 4154, 'max_depth': 6, 'min_split_loss': 6.125911140874031, 'subsample': 0.9868978377736272}. Best is trial 28 with value: 1.0098563569094956.
[I 2023-07-15 01:20:48,247] Trial 33 finished with value: 1.0097206079001837 and parameters: {'learning_rate': 0.11112362122684236, 'n_estimators': 4250, 'max_depth': 7, 'min_split_loss': 4.959636604897126, 'subsample': 0.9256851903968756}. Best is trial 33 with value: 1.0097206079001837.
[I 2023-07-15 01:21:36,508] Trial 34 finished with value: 1.0267892029444212 and parameters: {'learning_rate': 0.0699900978778992, 'n_estimators': 4666, 'max_depth': 3, 'min_split_loss': 4.470054199089831, 'subsample': 0.9165472615036583}. Best is trial 33 with value: 1.0097206079001837.
[I 2023-07-15 01:22:15,725] Trial 35 finished with value: 1.0006454078819218 and parameters: {'learning_rate': 0.15602739711959696, 'n_estimators': 3777, 'max_depth': 6, 'min_split_loss': 2.553403704155068, 'subsample': 0.9050045831818262}. Best is trial 35 with value: 1.0006454078819218.
[I 2023-07-15 01:22:53,882] Trial 36 finished with value: 1.0165328650868681 and parameters: {'learning_rate': 0.15969729261563623, 'n_estimators': 3684, 'max_depth': 3, 'min_split_loss': 2.4813345869220433, 'subsample': 0.9108137929471368}. Best is trial 35 with value: 1.0006454078819218.
[I 2023-07-15 01:23:22,642] Trial 37 finished with value: 1.0040858024307284 and parameters: {'learning_rate': 0.16123920628446634, 'n_estimators': 2706, 'max_depth': 5, 'min_split_loss': 2.590050463911777, 'subsample': 0.8955031934150768}. Best is trial 35 with value: 1.0006454078819218.
[I 2023-07-15 01:24:03,767] Trial 38 finished with value: 1.0072469480458364 and parameters: {'learning_rate': 0.011938547045589526, 'n_estimators': 2666, 'max_depth': 5, 'min_split_loss': 2.464363476709972, 'subsample': 0.8794936396900467}. Best is trial 35 with value: 1.0006454078819218.
[I 2023-07-15 01:24:40,795] Trial 39 finished with value: 1.0005241085422676 and parameters: {'learning_rate': 0.02158400884618652, 'n_estimators': 2463, 'max_depth': 5, 'min_split_loss': 2.481010928913032, 'subsample': 0.7815738989084675}. Best is trial 39 with value: 1.0005241085422676.
[I 2023-07-15 01:25:32,359] Trial 40 finished with value: 0.9867814716492548 and parameters: {'learning_rate': 0.02127626658874382, 'n_estimators': 2402, 'max_depth': 5, 'min_split_loss': 0.059859296927641026, 'subsample': 0.796871793912701}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:26:24,242] Trial 41 finished with value: 1.0447800093385133 and parameters: {'learning_rate': 0.001905029948673937, 'n_estimators': 2378, 'max_depth': 5, 'min_split_loss': 0.20254392106935004, 'subsample': 0.7737188043619777}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:26:47,555] Trial 42 finished with value: 1.0107410520171554 and parameters: {'learning_rate': 0.04527123404079844, 'n_estimators': 1852, 'max_depth': 4, 'min_split_loss': 2.4595722669765703, 'subsample': 0.8722020829307042}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:27:02,648] Trial 43 finished with value: 1.0733952502417126 and parameters: {'learning_rate': 0.008311422623737963, 'n_estimators': 1195, 'max_depth': 2, 'min_split_loss': 1.6116375351087768, 'subsample': 0.7858693082451123}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:27:30,989] Trial 44 finished with value: 0.99965987220119 and parameters: {'learning_rate': 0.16099655377995886, 'n_estimators': 2461, 'max_depth': 5, 'min_split_loss': 3.446829994461001, 'subsample': 0.7056409871798088}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:28:06,950] Trial 45 finished with value: 1.0021019053304596 and parameters: {'learning_rate': 0.1873522892725119, 'n_estimators': 3191, 'max_depth': 4, 'min_split_loss': 3.597130300110563, 'subsample': 0.6958327167879609}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:28:47,309] Trial 46 finished with value: 1.010854853146226 and parameters: {'learning_rate': 0.06871949933805205, 'n_estimators': 3264, 'max_depth': 3, 'min_split_loss': 3.419803788302134, 'subsample': 0.6999422731119689}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:29:08,174] Trial 47 finished with value: 1.0515739375257804 and parameters: {'learning_rate': 0.18287881537069992, 'n_estimators': 2063, 'max_depth': 1, 'min_split_loss': 0.22095667184025558, 'subsample': 0.7287991216996565}. Best is trial 40 with value: 0.9867814716492548.
[I 2023-07-15 01:29:26,525] Trial 48 finished with value: 0.985571140902044 and parameters: {'learning_rate': 0.07166352247086288, 'n_estimators': 916, 'max_depth': 6, 'min_split_loss': 1.460125278512008, 'subsample': 0.6918741618870508}. Best is trial 48 with value: 0.985571140902044.
[I 2023-07-15 01:29:28,826] Trial 49 finished with value: 1.02801538552088 and parameters: {'learning_rate': 0.0601965132232841, 'n_estimators': 79, 'max_depth': 6, 'min_split_loss': 1.2052580560466357, 'subsample': 0.757989408906537}. Best is trial 48 with value: 0.985571140902044.
[I 2023-07-15 01:29:46,481] Trial 50 finished with value: 0.9857485427882411 and parameters: {'learning_rate': 0.07534786639502536, 'n_estimators': 833, 'max_depth': 10, 'min_split_loss': 1.1352479886133482, 'subsample': 0.8012380070367774}. Best is trial 48 with value: 0.985571140902044.
[I 2023-07-15 01:30:05,969] Trial 51 finished with value: 0.9852294071913719 and parameters: {'learning_rate': 0.0623792106765998, 'n_estimators': 1067, 'max_depth': 10, 'min_split_loss': 1.4739270835510108, 'subsample': 0.8264160834439783}. Best is trial 51 with value: 0.9852294071913719.
[I 2023-07-15 01:31:05,067] Trial 52 finished with value: 1.043425773315227 and parameters: {'learning_rate': 0.06578146948854052, 'n_estimators': 700, 'max_depth': 10, 'min_split_loss': 0.01626197256004369, 'subsample': 0.8233667814637629}. Best is trial 51 with value: 0.9852294071913719.
[I 2023-07-15 01:31:29,686] Trial 53 finished with value: 0.9855264873940573 and parameters: {'learning_rate': 0.04642202318094034, 'n_estimators': 1276, 'max_depth': 11, 'min_split_loss': 1.6432430825074236, 'subsample': 0.7951611677108088}. Best is trial 51 with value: 0.9852294071913719.
[I 2023-07-15 01:31:50,976] Trial 54 finished with value: 0.9884958984285251 and parameters: {'learning_rate': 0.07448140139422202, 'n_estimators': 1288, 'max_depth': 11, 'min_split_loss': 1.3796095262604915, 'subsample': 0.8421112567628396}. Best is trial 51 with value: 0.9852294071913719.
[I 2023-07-15 01:32:11,538] Trial 55 finished with value: 0.9879628449677224 and parameters: {'learning_rate': 0.06817895540854302, 'n_estimators': 1332, 'max_depth': 11, 'min_split_loss': 1.596649410113837, 'subsample': 0.8456828004390725}. Best is trial 51 with value: 0.9852294071913719.
[I 2023-07-15 01:32:32,746] Trial 56 finished with value: 0.9828191051223344 and parameters: {'learning_rate': 0.04274355817443308, 'n_estimators': 636, 'max_depth': 10, 'min_split_loss': 0.8730979465721159, 'subsample': 0.7981246371818187}. Best is trial 56 with value: 0.9828191051223344.
[I 2023-07-15 01:33:02,042] Trial 57 finished with value: 1.5738447981495312 and parameters: {'learning_rate': 0.002025192828594473, 'n_estimators': 528, 'max_depth': 9, 'min_split_loss': 0.776195852473508, 'subsample': 0.7881767101143484}. Best is trial 56 with value: 0.9828191051223344.
[I 2023-07-15 01:33:32,190] Trial 58 finished with value: 0.9817220664220863 and parameters: {'learning_rate': 0.0352503008817555, 'n_estimators': 843, 'max_depth': 9, 'min_split_loss': 0.7051507470196597, 'subsample': 0.7570879315608724}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:33:43,436] Trial 59 finished with value: 0.996986903643284 and parameters: {'learning_rate': 0.24140240755779466, 'n_estimators': 799, 'max_depth': 9, 'min_split_loss': 1.8080789610487416, 'subsample': 0.7593026179226889}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:34:20,764] Trial 60 finished with value: 0.9842429946248668 and parameters: {'learning_rate': 0.04619486889991346, 'n_estimators': 1627, 'max_depth': 10, 'min_split_loss': 0.7339779170893937, 'subsample': 0.820303680542043}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:34:53,426] Trial 61 finished with value: 0.9830426863794399 and parameters: {'learning_rate': 0.04536915789885921, 'n_estimators': 1612, 'max_depth': 10, 'min_split_loss': 0.9562466407177583, 'subsample': 0.8201894362718462}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:35:12,642] Trial 62 finished with value: 0.9872486661041238 and parameters: {'learning_rate': 0.04558156990765409, 'n_estimators': 432, 'max_depth': 12, 'min_split_loss': 0.8977097491717667, 'subsample': 0.8205467328709517}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:35:41,950] Trial 63 finished with value: 0.9857509038667803 and parameters: {'learning_rate': 0.03924078493523759, 'n_estimators': 1551, 'max_depth': 12, 'min_split_loss': 1.877392396599614, 'subsample': 0.7500165947889181}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:35:58,889] Trial 64 finished with value: 0.990309698898083 and parameters: {'learning_rate': 0.14151083500749037, 'n_estimators': 992, 'max_depth': 9, 'min_split_loss': 0.8054798021181345, 'subsample': 0.8404123267844018}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:36:22,047] Trial 65 finished with value: 0.9881995446631844 and parameters: {'learning_rate': 0.09087404411222127, 'n_estimators': 1524, 'max_depth': 10, 'min_split_loss': 1.9477944331070915, 'subsample': 0.7132966998015592}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:36:27,609] Trial 66 finished with value: 1.012833603498521 and parameters: {'learning_rate': 0.03793431124437039, 'n_estimators': 90, 'max_depth': 10, 'min_split_loss': 3.0823335532283296, 'subsample': 0.6788882872430195}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:37:13,584] Trial 67 finished with value: 1.0072817962784941 and parameters: {'learning_rate': 0.1219276083880741, 'n_estimators': 2130, 'max_depth': 8, 'min_split_loss': 0.6644219364967744, 'subsample': 0.7301561923316304}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:37:22,329] Trial 68 finished with value: 0.9996948924775833 and parameters: {'learning_rate': 0.21372465507862431, 'n_estimators': 387, 'max_depth': 11, 'min_split_loss': 0.6942611545775678, 'subsample': 0.8717841357388378}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:37:36,053] Trial 69 finished with value: 0.9975621972398028 and parameters: {'learning_rate': 0.08925495162996694, 'n_estimators': 1012, 'max_depth': 12, 'min_split_loss': 3.952309703142766, 'subsample': 0.819530433375911}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:37:58,931] Trial 70 finished with value: 0.990128957931453 and parameters: {'learning_rate': 0.133716621445778, 'n_estimators': 1567, 'max_depth': 11, 'min_split_loss': 1.9304236769745624, 'subsample': 0.7619814171608162}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:38:14,046] Trial 71 finished with value: 0.9878951215959251 and parameters: {'learning_rate': 0.0882226225147237, 'n_estimators': 740, 'max_depth': 10, 'min_split_loss': 1.3621441840617863, 'subsample': 0.7819903521707278}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:38:45,921] Trial 72 finished with value: 0.9877383311534944 and parameters: {'learning_rate': 0.03692993461213871, 'n_estimators': 946, 'max_depth': 14, 'min_split_loss': 1.0995259805684645, 'subsample': 0.8027637439263728}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:39:48,605] Trial 73 finished with value: 1.0079064563789848 and parameters: {'learning_rate': 0.05056381815443322, 'n_estimators': 1731, 'max_depth': 9, 'min_split_loss': 0.42276709535705737, 'subsample': 0.7345003151323343}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:40:05,774] Trial 74 finished with value: 0.9962532057982179 and parameters: {'learning_rate': 0.09168381439733939, 'n_estimators': 1358, 'max_depth': 8, 'min_split_loss': 3.0424287320912393, 'subsample': 0.8336761365365748}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:40:15,268] Trial 75 finished with value: 0.9901712725033137 and parameters: {'learning_rate': 0.12886100372073178, 'n_estimators': 522, 'max_depth': 10, 'min_split_loss': 1.2670517793548681, 'subsample': 0.8826459498201815}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:40:51,593] Trial 76 finished with value: 0.9849675429771774 and parameters: {'learning_rate': 0.029682257643949578, 'n_estimators': 2017, 'max_depth': 10, 'min_split_loss': 1.9434081209021867, 'subsample': 0.8085995637929262}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:42:09,716] Trial 77 finished with value: 0.9883576126900279 and parameters: {'learning_rate': 0.006730323534764271, 'n_estimators': 2105, 'max_depth': 12, 'min_split_loss': 2.0770918193683316, 'subsample': 0.8578144982152517}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:42:29,525] Trial 78 finished with value: 0.9991212491419802 and parameters: {'learning_rate': 0.024407131724201645, 'n_estimators': 1192, 'max_depth': 8, 'min_split_loss': 2.8245525817940837, 'subsample': 0.9415650025258018}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:43:16,777] Trial 79 finished with value: 1.018391519700326 and parameters: {'learning_rate': 0.10829300946587989, 'n_estimators': 1754, 'max_depth': 14, 'min_split_loss': 0.5066105809873449, 'subsample': 0.751832527173802}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:43:26,069] Trial 80 finished with value: 0.9924252903053492 and parameters: {'learning_rate': 0.042684224872409283, 'n_estimators': 259, 'max_depth': 9, 'min_split_loss': 2.287780059565791, 'subsample': 0.8150400098519095}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:43:40,024] Trial 81 finished with value: 0.9878862477003298 and parameters: {'learning_rate': 0.07687062418514995, 'n_estimators': 628, 'max_depth': 11, 'min_split_loss': 1.5539513142977572, 'subsample': 0.7993929861049305}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:44:08,991] Trial 82 finished with value: 0.9850690856318363 and parameters: {'learning_rate': 0.05031122437273722, 'n_estimators': 858, 'max_depth': 10, 'min_split_loss': 0.7443087942906031, 'subsample': 0.7734751595973343}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:44:52,033] Trial 83 finished with value: 0.9865185075065576 and parameters: {'learning_rate': 0.03523819600185106, 'n_estimators': 1107, 'max_depth': 9, 'min_split_loss': 0.5137678206684175, 'subsample': 0.7749295885779146}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:47:45,880] Trial 84 finished with value: 1.1324884176088423 and parameters: {'learning_rate': 0.14205412716510096, 'n_estimators': 1967, 'max_depth': 11, 'min_split_loss': 0.0320450447459919, 'subsample': 0.8548775838071648}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:48:11,916] Trial 85 finished with value: 0.9862341350969233 and parameters: {'learning_rate': 0.05700106097179835, 'n_estimators': 1444, 'max_depth': 12, 'min_split_loss': 1.630958817653421, 'subsample': 0.6745261036160605}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:48:18,122] Trial 86 finished with value: 0.9945970154657763 and parameters: {'learning_rate': 0.0979817801904496, 'n_estimators': 334, 'max_depth': 13, 'min_split_loss': 3.1165638257399473, 'subsample': 0.7422447760772253}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:48:35,937] Trial 87 finished with value: 0.9889931356457031 and parameters: {'learning_rate': 0.02327675913226224, 'n_estimators': 880, 'max_depth': 8, 'min_split_loss': 2.1278206082132667, 'subsample': 0.7732142783999499}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:49:07,650] Trial 88 finished with value: 1.073338794751443 and parameters: {'learning_rate': 0.002194878123015623, 'n_estimators': 1178, 'max_depth': 7, 'min_split_loss': 1.009073111626619, 'subsample': 0.8921291879004906}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:49:36,934] Trial 89 finished with value: 0.9949138054278974 and parameters: {'learning_rate': 0.1779082820064046, 'n_estimators': 2950, 'max_depth': 10, 'min_split_loss': 2.618834171555604, 'subsample': 0.8322394887936497}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:50:22,487] Trial 90 finished with value: 1.035854282377787 and parameters: {'learning_rate': 0.11299905027727267, 'n_estimators': 1645, 'max_depth': 10, 'min_split_loss': 0.39769952756594407, 'subsample': 0.7201992908489232}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:50:37,779] Trial 91 finished with value: 0.9853075509687159 and parameters: {'learning_rate': 0.07054332518594787, 'n_estimators': 835, 'max_depth': 10, 'min_split_loss': 1.0737952109042717, 'subsample': 0.7987574503132816}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:50:51,339] Trial 92 finished with value: 0.9876333129627197 and parameters: {'learning_rate': 0.06269463603139169, 'n_estimators': 640, 'max_depth': 11, 'min_split_loss': 1.4046012186591608, 'subsample': 0.8033772089556366}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:51:16,223] Trial 93 finished with value: 0.9817868262671456 and parameters: {'learning_rate': 0.02635586748983021, 'n_estimators': 1016, 'max_depth': 9, 'min_split_loss': 0.8459590010615032, 'subsample': 0.8588504990219831}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:52:40,622] Trial 94 finished with value: 1.013111784698826 and parameters: {'learning_rate': 0.025021231905325014, 'n_estimators': 1895, 'max_depth': 9, 'min_split_loss': 0.05089282122203509, 'subsample': 0.853785961145654}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:52:41,919] Trial 95 finished with value: 1.6568490306270687 and parameters: {'learning_rate': 0.053106451658147306, 'n_estimators': 18, 'max_depth': 9, 'min_split_loss': 0.8170561462701083, 'subsample': 0.8671133903662211}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:52:57,317] Trial 96 finished with value: 0.9907788232035337 and parameters: {'learning_rate': 0.0965432502011007, 'n_estimators': 1354, 'max_depth': 8, 'min_split_loss': 2.177470734810511, 'subsample': 0.8260143620893109}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:53:23,794] Trial 97 finished with value: 0.9840680262939046 and parameters: {'learning_rate': 0.019684759636642, 'n_estimators': 1095, 'max_depth': 10, 'min_split_loss': 1.0016576143615739, 'subsample': 0.8880238692511361}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:53:51,506] Trial 98 finished with value: 0.9844782625157771 and parameters: {'learning_rate': 0.019006072731166467, 'n_estimators': 1071, 'max_depth': 10, 'min_split_loss': 1.0159825587016684, 'subsample': 0.9002364561341822}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:54:36,632] Trial 99 finished with value: 0.9818140627716758 and parameters: {'learning_rate': 0.016099426185867526, 'n_estimators': 1069, 'max_depth': 10, 'min_split_loss': 0.3989862321628616, 'subsample': 0.903666307882585}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:54:47,252] Trial 100 finished with value: 2.8655948120710812 and parameters: {'learning_rate': 0.0011619162028382417, 'n_estimators': 209, 'max_depth': 9, 'min_split_loss': 0.4268950911247093, 'subsample': 0.899967673652287}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:55:09,968] Trial 101 finished with value: 0.9854267577062077 and parameters: {'learning_rate': 0.026288463461064802, 'n_estimators': 1053, 'max_depth': 10, 'min_split_loss': 0.7163367162548059, 'subsample': 0.9574826927407236}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:57:53,556] Trial 102 finished with value: 1.0739978703965376 and parameters: {'learning_rate': 0.036115692241569894, 'n_estimators': 2280, 'max_depth': 10, 'min_split_loss': 0.007464562408978526, 'subsample': 0.9155304388984612}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:58:21,660] Trial 103 finished with value: 0.9857794638633717 and parameters: {'learning_rate': 0.016328757699380955, 'n_estimators': 585, 'max_depth': 11, 'min_split_loss': 1.0033039503135401, 'subsample': 0.8870102328964689}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:58:38,860] Trial 104 finished with value: 0.9921286570191048 and parameters: {'learning_rate': 0.052159355262039936, 'n_estimators': 1465, 'max_depth': 9, 'min_split_loss': 1.9230038921885622, 'subsample': 0.9302733959481475}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:59:00,238] Trial 105 finished with value: 0.9867037072528255 and parameters: {'learning_rate': 0.08354095483512394, 'n_estimators': 1119, 'max_depth': 7, 'min_split_loss': 0.3922893775580994, 'subsample': 0.8668825412853656}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:59:30,833] Trial 106 finished with value: 0.9860838784406359 and parameters: {'learning_rate': 0.025569735299711044, 'n_estimators': 1762, 'max_depth': 11, 'min_split_loss': 1.185192920036491, 'subsample': 0.9033499592672538}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 01:59:54,499] Trial 107 finished with value: 1.9955550313883983 and parameters: {'learning_rate': 0.0016146737049965061, 'n_estimators': 432, 'max_depth': 10, 'min_split_loss': 2.2321038779048945, 'subsample': 0.8398054940904587}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:00:02,201] Trial 108 finished with value: 0.9931189024122029 and parameters: {'learning_rate': 0.11113374319439492, 'n_estimators': 781, 'max_depth': 8, 'min_split_loss': 1.7058313137884868, 'subsample': 0.9152073607019526}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:00:15,568] Trial 109 finished with value: 0.9834925842524956 and parameters: {'learning_rate': 0.05355887195268705, 'n_estimators': 979, 'max_depth': 8, 'min_split_loss': 0.6536001060751624, 'subsample': 0.9504045507762464}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:00:26,771] Trial 110 finished with value: 0.9877008529551073 and parameters: {'learning_rate': 0.1426419246352929, 'n_estimators': 1414, 'max_depth': 8, 'min_split_loss': 0.6939309895910983, 'subsample': 0.9680084311541025}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:00:35,896] Trial 111 finished with value: 0.9904565505815454 and parameters: {'learning_rate': 0.057719468400706395, 'n_estimators': 972, 'max_depth': 9, 'min_split_loss': 1.393245681614709, 'subsample': 0.9529702681225117}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:00:52,430] Trial 112 finished with value: 0.9896300902533197 and parameters: {'learning_rate': 0.07206307095750554, 'n_estimators': 1139, 'max_depth': 10, 'min_split_loss': 0.4228964615973754, 'subsample': 0.8834240553057091}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:01:02,915] Trial 113 finished with value: 0.9850147721368889 and parameters: {'learning_rate': 0.03373342712287175, 'n_estimators': 673, 'max_depth': 9, 'min_split_loss': 0.8883728897729676, 'subsample': 0.9255998301436194}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:01:12,923] Trial 114 finished with value: 0.9931797880148903 and parameters: {'learning_rate': 0.029974942284780336, 'n_estimators': 729, 'max_depth': 9, 'min_split_loss': 0.9794566388124962, 'subsample': 0.9966526480718052}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:01:25,966] Trial 115 finished with value: 1.0020337009340279 and parameters: {'learning_rate': 0.08811065097200757, 'n_estimators': 1619, 'max_depth': 7, 'min_split_loss': 2.7709092086879155, 'subsample': 0.9298137326693846}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:02:26,560] Trial 116 finished with value: 1.0133951659450917 and parameters: {'learning_rate': 0.01849440850830293, 'n_estimators': 487, 'max_depth': 20, 'min_split_loss': 0.2817241588261763, 'subsample': 0.9439088172519341}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:02:37,745] Trial 117 finished with value: 0.9895400858807939 and parameters: {'learning_rate': 0.053139039243796884, 'n_estimators': 1286, 'max_depth': 9, 'min_split_loss': 1.8701906091917961, 'subsample': 0.9098440066408429}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:02:46,868] Trial 118 finished with value: 0.9947007640922267 and parameters: {'learning_rate': 0.10452105071685289, 'n_estimators': 930, 'max_depth': 12, 'min_split_loss': 0.7525303181753932, 'subsample': 0.9674363404725203}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:02:49,908] Trial 119 finished with value: 0.9905150522567363 and parameters: {'learning_rate': 0.12341538219065402, 'n_estimators': 209, 'max_depth': 8, 'min_split_loss': 1.2361864426182174, 'subsample': 0.8788682058258462}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:02:58,531] Trial 120 finished with value: 0.9931504327901873 and parameters: {'learning_rate': 0.03494898728306246, 'n_estimators': 645, 'max_depth': 10, 'min_split_loss': 2.3457987775459483, 'subsample': 0.9289775890431488}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:03:11,164] Trial 121 finished with value: 0.986643269962298 and parameters: {'learning_rate': 0.056385073748408794, 'n_estimators': 1241, 'max_depth': 11, 'min_split_loss': 1.5650133001179154, 'subsample': 0.8159661763627252}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:03:23,171] Trial 122 finished with value: 0.9837107513014344 and parameters: {'learning_rate': 0.07753843957201939, 'n_estimators': 1037, 'max_depth': 9, 'min_split_loss': 0.897928665633718, 'subsample': 0.8508931696698934}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:04:06,352] Trial 123 finished with value: 0.9853290499641126 and parameters: {'learning_rate': 0.08404648402896081, 'n_estimators': 5131, 'max_depth': 9, 'min_split_loss': 0.8524745113329135, 'subsample': 0.862427761669835}. Best is trial 58 with value: 0.9817220664220863.
[I 2023-07-15 02:04:25,512] Trial 124 finished with value: 0.9800658843117906 and parameters: {'learning_rate': 0.017848344318085723, 'n_estimators': 806, 'max_depth': 8, 'min_split_loss': 0.08543909708097697, 'subsample': 0.8513687064854506}. Best is trial 124 with value: 0.9800658843117906.
[I 2023-07-15 02:05:02,098] Trial 125 finished with value: 0.9810636974275532 and parameters: {'learning_rate': 0.019965001804488063, 'n_estimators': 1501, 'max_depth': 8, 'min_split_loss': 0.12184865420780544, 'subsample': 0.8462731737694353}. Best is trial 124 with value: 0.9800658843117906.
[I 2023-07-15 02:05:41,439] Trial 126 finished with value: 1.0818058253917238 and parameters: {'learning_rate': 0.0015896389551994378, 'n_estimators': 1490, 'max_depth': 8, 'min_split_loss': 0.28197157639886516, 'subsample': 0.8552823831975327}. Best is trial 124 with value: 0.9800658843117906.
[I 2023-07-15 02:06:16,912] Trial 127 finished with value: 0.9797872003864752 and parameters: {'learning_rate': 0.021446884778389415, 'n_estimators': 1944, 'max_depth': 7, 'min_split_loss': 0.019428984914023173, 'subsample': 0.9006079039611383}. Best is trial 127 with value: 0.9797872003864752.
[I 2023-07-15 02:06:45,375] Trial 128 finished with value: 0.9837436588670521 and parameters: {'learning_rate': 0.01670258073950271, 'n_estimators': 1855, 'max_depth': 6, 'min_split_loss': 0.19455706480905643, 'subsample': 0.8939732425399636}. Best is trial 127 with value: 0.9797872003864752.
[I 2023-07-15 02:07:28,407] Trial 129 finished with value: 1.0205071960293532 and parameters: {'learning_rate': 0.0019277072204525597, 'n_estimators': 2209, 'max_depth': 7, 'min_split_loss': 0.1037997113225364, 'subsample': 0.8386371044909969}. Best is trial 127 with value: 0.9797872003864752.
[I 2023-07-15 02:07:52,727] Trial 130 finished with value: 0.9849196932349666 and parameters: {'learning_rate': 0.07478861558257775, 'n_estimators': 1777, 'max_depth': 6, 'min_split_loss': 0.35414847882468836, 'subsample': 0.8860232672089934}. Best is trial 127 with value: 0.9797872003864752.
[I 2023-07-15 02:08:22,884] Trial 131 finished with value: 0.9788187798805768 and parameters: {'learning_rate': 0.021198546238726898, 'n_estimators': 1631, 'max_depth': 7, 'min_split_loss': 0.001675880790564889, 'subsample': 0.8996878314239521}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:08:54,626] Trial 132 finished with value: 0.9852252690992813 and parameters: {'learning_rate': 0.04821662429645872, 'n_estimators': 1997, 'max_depth': 6, 'min_split_loss': 0.09163587238298343, 'subsample': 0.864235694320586}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:09:33,203] Trial 133 finished with value: 0.9798777876240239 and parameters: {'learning_rate': 0.023218198018731893, 'n_estimators': 2557, 'max_depth': 7, 'min_split_loss': 0.5334123000665054, 'subsample': 0.8435136190137457}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:10:06,571] Trial 134 finished with value: 0.9789620688611353 and parameters: {'learning_rate': 0.017826426464341683, 'n_estimators': 1866, 'max_depth': 7, 'min_split_loss': 0.05182765447186031, 'subsample': 0.8471804042305389}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:10:37,473] Trial 135 finished with value: 0.9911341646092666 and parameters: {'learning_rate': 0.07287196258921044, 'n_estimators': 2524, 'max_depth': 7, 'min_split_loss': 0.44932761070987104, 'subsample': 0.8463810567767734}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:11:09,069] Trial 136 finished with value: 0.9835895115753335 and parameters: {'learning_rate': 0.016048154306224027, 'n_estimators': 1913, 'max_depth': 6, 'min_split_loss': 0.004126045788471533, 'subsample': 0.8728786930469056}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:11:47,780] Trial 137 finished with value: 0.9900288307708056 and parameters: {'learning_rate': 0.09598578037919064, 'n_estimators': 2987, 'max_depth': 7, 'min_split_loss': 0.5652227070582754, 'subsample': 0.8506230408358606}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:12:57,650] Trial 138 finished with value: 1.0107706699023296 and parameters: {'learning_rate': 0.03923535119312692, 'n_estimators': 3462, 'max_depth': 7, 'min_split_loss': 0.0637502361798836, 'subsample': 0.8730627472492233}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:13:16,706] Trial 139 finished with value: 0.9865684780242205 and parameters: {'learning_rate': 0.060149198216984084, 'n_estimators': 1523, 'max_depth': 6, 'min_split_loss': 1.3705775569214453, 'subsample': 0.8313989778008666}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:14:09,978] Trial 140 finished with value: 1.1296318393459421 and parameters: {'learning_rate': 0.2697667522046535, 'n_estimators': 2205, 'max_depth': 8, 'min_split_loss': 0.02167525344148913, 'subsample': 0.9503567388834131}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:14:40,109] Trial 141 finished with value: 0.9879800113057244 and parameters: {'learning_rate': 0.012414464576794215, 'n_estimators': 1872, 'max_depth': 6, 'min_split_loss': 0.489046553820074, 'subsample': 0.9051237999225561}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:15:13,653] Trial 142 finished with value: 1.1460375889866723 and parameters: {'learning_rate': 0.0012269131689471545, 'n_estimators': 1648, 'max_depth': 7, 'min_split_loss': 0.010763591575780829, 'subsample': 0.8753287212096135}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:15:33,722] Trial 143 finished with value: 0.9964384417198181 and parameters: {'learning_rate': 0.030776607688334953, 'n_estimators': 1826, 'max_depth': 5, 'min_split_loss': 1.231965981261637, 'subsample': 0.8976075038129429}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:16:21,389] Trial 144 finished with value: 0.9796785561569262 and parameters: {'learning_rate': 0.017530574786411707, 'n_estimators': 2601, 'max_depth': 8, 'min_split_loss': 0.5178012801949297, 'subsample': 0.8438166456782153}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:16:53,690] Trial 145 finished with value: 0.9812483677271997 and parameters: {'learning_rate': 0.042382162949761035, 'n_estimators': 2837, 'max_depth': 8, 'min_split_loss': 0.76387949321864, 'subsample': 0.833745839159148}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:17:34,062] Trial 146 finished with value: 0.9872885989452291 and parameters: {'learning_rate': 0.043856070486223434, 'n_estimators': 2802, 'max_depth': 8, 'min_split_loss': 0.557483043117087, 'subsample': 0.8109960305745081}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:18:04,632] Trial 147 finished with value: 0.9851471084638291 and parameters: {'learning_rate': 0.022384502347670775, 'n_estimators': 2710, 'max_depth': 7, 'min_split_loss': 1.6959473088562875, 'subsample': 0.793849405886234}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:18:22,961] Trial 148 finished with value: 1.0170708280423866 and parameters: {'learning_rate': 0.05399468456864731, 'n_estimators': 2243, 'max_depth': 8, 'min_split_loss': 9.639199318925222, 'subsample': 0.842263335922061}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:18:47,621] Trial 149 finished with value: 0.9854400699426014 and parameters: {'learning_rate': 0.10282572963835643, 'n_estimators': 2519, 'max_depth': 8, 'min_split_loss': 1.2636897577433999, 'subsample': 0.8272980928159386}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:19:09,827] Trial 150 finished with value: 0.9814578218324801 and parameters: {'learning_rate': 0.037609631978307544, 'n_estimators': 2044, 'max_depth': 7, 'min_split_loss': 0.7239497313748172, 'subsample': 0.9152171064352252}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:19:32,294] Trial 151 finished with value: 1.0010871552892486 and parameters: {'learning_rate': 0.3527749806578899, 'n_estimators': 2416, 'max_depth': 7, 'min_split_loss': 0.7348850288489828, 'subsample': 0.9243596085997248}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:19:48,408] Trial 152 finished with value: 1.0152270727746922 and parameters: {'learning_rate': 0.038725732939502586, 'n_estimators': 2016, 'max_depth': 7, 'min_split_loss': 7.610900306696605, 'subsample': 0.8692470866579396}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:21:03,802] Trial 153 finished with value: 0.9958071448569028 and parameters: {'learning_rate': 0.0021203814062356816, 'n_estimators': 3023, 'max_depth': 8, 'min_split_loss': 0.623507401049691, 'subsample': 0.9404787278472703}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:21:26,336] Trial 154 finished with value: 0.9840222885753362 and parameters: {'learning_rate': 0.06907097735200993, 'n_estimators': 2616, 'max_depth': 7, 'min_split_loss': 0.4249726475279946, 'subsample': 0.9729112447809233}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:22:03,848] Trial 155 finished with value: 0.9801557068799103 and parameters: {'learning_rate': 0.02669858817415144, 'n_estimators': 2305, 'max_depth': 6, 'min_split_loss': 0.014547695920964887, 'subsample': 0.9099040946817646}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:22:24,371] Trial 156 finished with value: 0.9852123865669208 and parameters: {'learning_rate': 0.04497129634025513, 'n_estimators': 2366, 'max_depth': 8, 'min_split_loss': 1.1263373827259995, 'subsample': 0.9143670001722435}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:22:45,023] Trial 157 finished with value: 1.019883179503075 and parameters: {'learning_rate': 0.46766325895078015, 'n_estimators': 2174, 'max_depth': 7, 'min_split_loss': 1.5906773924937214, 'subsample': 0.782274736128951}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:23:33,603] Trial 158 finished with value: 0.981309589131673 and parameters: {'learning_rate': 0.028402589927782128, 'n_estimators': 3381, 'max_depth': 6, 'min_split_loss': 0.5719568874990546, 'subsample': 0.8214090093919307}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:24:15,353] Trial 159 finished with value: 0.9836536156437393 and parameters: {'learning_rate': 0.0301025621778959, 'n_estimators': 3873, 'max_depth': 6, 'min_split_loss': 1.1820795225377472, 'subsample': 0.8311230562818391}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:24:52,404] Trial 160 finished with value: 1.0228631401117592 and parameters: {'learning_rate': 0.4166857489322021, 'n_estimators': 2884, 'max_depth': 4, 'min_split_loss': 0.016464554018017427, 'subsample': 0.8116314225843189}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:25:27,657] Trial 161 finished with value: 0.9849297415641509 and parameters: {'learning_rate': 0.06613471778944639, 'n_estimators': 3394, 'max_depth': 8, 'min_split_loss': 0.6080614550918705, 'subsample': 0.8582164340089042}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:26:11,951] Trial 162 finished with value: 0.9902507553019714 and parameters: {'learning_rate': 0.019158915783852, 'n_estimators': 3238, 'max_depth': 5, 'min_split_loss': 0.8630977643402893, 'subsample': 0.8919509964761129}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:26:54,376] Trial 163 finished with value: 0.986868428470651 and parameters: {'learning_rate': 0.04908221135464257, 'n_estimators': 2607, 'max_depth': 7, 'min_split_loss': 0.4609459220614766, 'subsample': 0.8361537480618144}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:27:22,239] Trial 164 finished with value: 0.9924471814037903 and parameters: {'learning_rate': 0.08457567666757781, 'n_estimators': 3101, 'max_depth': 6, 'min_split_loss': 1.5032650829292686, 'subsample': 0.913649461844643}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:28:08,514] Trial 165 finished with value: 0.9805720805972353 and parameters: {'learning_rate': 0.031141262147961486, 'n_estimators': 3651, 'max_depth': 8, 'min_split_loss': 0.9312573812947609, 'subsample': 0.809023789588202}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:28:37,970] Trial 166 finished with value: 0.9856578322155469 and parameters: {'learning_rate': 0.02438392926127141, 'n_estimators': 2824, 'max_depth': 8, 'min_split_loss': 1.98737417303825, 'subsample': 0.7893050216373221}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:29:22,589] Trial 167 finished with value: 1.0618192741665706 and parameters: {'learning_rate': 0.6763657244778115, 'n_estimators': 3956, 'max_depth': 6, 'min_split_loss': 1.0005836892456663, 'subsample': 0.8144665758662195}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:31:09,008] Trial 168 finished with value: 0.9807905604623628 and parameters: {'learning_rate': 0.002898234878979923, 'n_estimators': 3328, 'max_depth': 9, 'min_split_loss': 0.3626971124811102, 'subsample': 0.84718533596314}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:32:34,871] Trial 169 finished with value: 1.0389343187619486 and parameters: {'learning_rate': 0.001197985971605687, 'n_estimators': 2306, 'max_depth': 9, 'min_split_loss': 0.430660421158596, 'subsample': 0.8506857644999021}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:33:13,333] Trial 170 finished with value: 0.9819961185152218 and parameters: {'learning_rate': 0.0294234147042384, 'n_estimators': 3290, 'max_depth': 7, 'min_split_loss': 1.4302293011783584, 'subsample': 0.7524281999087825}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:34:22,698] Trial 171 finished with value: 0.9918040171904818 and parameters: {'learning_rate': 0.02652447359968849, 'n_estimators': 3787, 'max_depth': 7, 'min_split_loss': 0.40080543744428304, 'subsample': 0.7618011981014767}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:34:55,824] Trial 172 finished with value: 1.0060279363853541 and parameters: {'learning_rate': 0.019565369509885883, 'n_estimators': 3592, 'max_depth': 7, 'min_split_loss': 5.421480340928618, 'subsample': 0.801169339112689}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:35:35,783] Trial 173 finished with value: 0.9814792082157053 and parameters: {'learning_rate': 0.042673494381435, 'n_estimators': 3325, 'max_depth': 7, 'min_split_loss': 1.2766518220148233, 'subsample': 0.7646486885095204}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:36:08,582] Trial 174 finished with value: 0.9829914601095296 and parameters: {'learning_rate': 0.06549678890407644, 'n_estimators': 2988, 'max_depth': 7, 'min_split_loss': 1.3431078588312289, 'subsample': 0.7625958099579455}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:37:41,357] Trial 175 finished with value: 1.048504459340423 and parameters: {'learning_rate': 0.04215281656461217, 'n_estimators': 3540, 'max_depth': 8, 'min_split_loss': 0.05263735260592828, 'subsample': 0.7341702929211695}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:38:18,251] Trial 176 finished with value: 0.9950769733065652 and parameters: {'learning_rate': 0.013599990073578011, 'n_estimators': 3287, 'max_depth': 6, 'min_split_loss': 1.6922747866439065, 'subsample': 0.881373344516492}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:38:58,108] Trial 177 finished with value: 0.979571966000444 and parameters: {'learning_rate': 0.037098199174815674, 'n_estimators': 3580, 'max_depth': 8, 'min_split_loss': 0.9321785187119271, 'subsample': 0.8393507679254153}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:40:27,018] Trial 178 finished with value: 0.9882918375210054 and parameters: {'learning_rate': 0.0023268926696196866, 'n_estimators': 3609, 'max_depth': 8, 'min_split_loss': 0.771892645102837, 'subsample': 0.8631867535117235}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:41:54,899] Trial 179 finished with value: 1.1632179762971622 and parameters: {'learning_rate': 0.2352920215047299, 'n_estimators': 3413, 'max_depth': 8, 'min_split_loss': 0.013815429342460935, 'subsample': 0.8419878606918069}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:42:48,455] Trial 180 finished with value: 1.0057738412767623 and parameters: {'learning_rate': 0.08108364535225762, 'n_estimators': 4072, 'max_depth': 9, 'min_split_loss': 1.0165424235887315, 'subsample': 0.5976935603786311}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:43:33,912] Trial 181 finished with value: 0.9840692669508233 and parameters: {'learning_rate': 0.0395266166666831, 'n_estimators': 3234, 'max_depth': 7, 'min_split_loss': 0.5650547947204869, 'subsample': 0.8225413250099559}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:44:13,393] Trial 182 finished with value: 1.0041645884440877 and parameters: {'learning_rate': 0.32391596410177514, 'n_estimators': 3709, 'max_depth': 6, 'min_split_loss': 1.270201495477907, 'subsample': 0.7905940129668612}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:45:03,498] Trial 183 finished with value: 1.0661250344700712 and parameters: {'learning_rate': 0.20140083939421025, 'n_estimators': 3133, 'max_depth': 8, 'min_split_loss': 0.3482399787285235, 'subsample': 0.7482679270159486}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:45:47,977] Trial 184 finished with value: 0.9808711892474555 and parameters: {'learning_rate': 0.02863270988114047, 'n_estimators': 3301, 'max_depth': 7, 'min_split_loss': 0.8664023453353575, 'subsample': 0.7716326608239756}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:46:22,556] Trial 185 finished with value: 0.9872637701336091 and parameters: {'learning_rate': 0.0658674646645438, 'n_estimators': 3350, 'max_depth': 5, 'min_split_loss': 0.9510600262836223, 'subsample': 0.8847359976264186}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:47:10,553] Trial 186 finished with value: 0.9847180653902103 and parameters: {'learning_rate': 0.02175896700012613, 'n_estimators': 2718, 'max_depth': 9, 'min_split_loss': 0.4357924827723051, 'subsample': 0.8551959539077207}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:47:39,400] Trial 187 finished with value: 0.9884874801364525 and parameters: {'learning_rate': 0.05508019544614058, 'n_estimators': 3494, 'max_depth': 8, 'min_split_loss': 2.1848932920770263, 'subsample': 0.8338871087056515}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:48:42,489] Trial 188 finished with value: 1.0187714022399064 and parameters: {'learning_rate': 0.001651230905401039, 'n_estimators': 3791, 'max_depth': 6, 'min_split_loss': 0.7908457152992395, 'subsample': 0.9005391701735688}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:49:39,407] Trial 189 finished with value: 0.9957964243641984 and parameters: {'learning_rate': 0.03614055349829534, 'n_estimators': 3120, 'max_depth': 7, 'min_split_loss': 0.37811952257389164, 'subsample': 0.7748913355347935}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:49:59,535] Trial 190 finished with value: 0.9830147013677554 and parameters: {'learning_rate': 0.06097899424720434, 'n_estimators': 2075, 'max_depth': 7, 'min_split_loss': 1.0719745991338163, 'subsample': 0.8685796523714542}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:50:40,813] Trial 191 finished with value: 0.9819752399234789 and parameters: {'learning_rate': 0.028126802270683583, 'n_estimators': 3614, 'max_depth': 7, 'min_split_loss': 1.4872301977138065, 'subsample': 0.7249301817635183}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:51:18,560] Trial 192 finished with value: 0.9846176936738752 and parameters: {'learning_rate': 0.020112367295666636, 'n_estimators': 3667, 'max_depth': 7, 'min_split_loss': 1.6360957368977467, 'subsample': 0.802069993734477}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:52:16,091] Trial 193 finished with value: 0.9907963883965614 and parameters: {'learning_rate': 0.038808263916358526, 'n_estimators': 3983, 'max_depth': 8, 'min_split_loss': 0.7308284838890843, 'subsample': 0.7251342969981598}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:53:22,032] Trial 194 finished with value: 0.9915674589305461 and parameters: {'learning_rate': 0.02356563294804082, 'n_estimators': 3452, 'max_depth': 7, 'min_split_loss': 0.023096681098909222, 'subsample': 0.8251648989188601}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:53:57,200] Trial 195 finished with value: 0.996433601885405 and parameters: {'learning_rate': 0.04997731634985496, 'n_estimators': 4293, 'max_depth': 9, 'min_split_loss': 4.384498407569398, 'subsample': 0.7106614559002431}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:54:22,121] Trial 196 finished with value: 1.0235665813114139 and parameters: {'learning_rate': 0.09130717377982381, 'n_estimators': 2877, 'max_depth': 6, 'min_split_loss': 10.970788625286996, 'subsample': 0.7795944328947799}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:55:06,431] Trial 197 finished with value: 1.0015426668001606 and parameters: {'learning_rate': 0.0034322372406441048, 'n_estimators': 2497, 'max_depth': 8, 'min_split_loss': 3.7774898604106584, 'subsample': 0.8512881489935011}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:55:37,324] Trial 198 finished with value: 1.014894549886684 and parameters: {'learning_rate': 0.03218880584756105, 'n_estimators': 3752, 'max_depth': 8, 'min_split_loss': 8.844438786608148, 'subsample': 0.8123148259127336}. Best is trial 131 with value: 0.9788187798805768.
[I 2023-07-15 02:56:22,817] Trial 199 finished with value: 1.0504749617040825 and parameters: {'learning_rate': 0.2938791537817739, 'n_estimators': 3574, 'max_depth': 7, 'min_split_loss': 0.36361795320808443, 'subsample': 0.8347515969997998}. Best is trial 131 with value: 0.9788187798805768.
Number of finished trials: 200
Best trial: {'learning_rate': 0.021198546238726898, 'n_estimators': 1631, 'max_depth': 7, 'min_split_loss': 0.001675880790564889, 'subsample': 0.8996878314239521}
In [160]:
params = {'learning_rate': 0.021198546238726898, 'n_estimators': 1631, 'max_depth': 7, 'min_split_loss': 0.001675880790564889, 'subsample': 0.8996878314239521}
In [161]:
model = XGBRegressor(**params, random_state=42, n_jobs=-1, tree_method='gpu_hist')
model.fit(X_scaled, y)
Out[161]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.021198546238726898,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=7, max_leaves=None,
             min_child_weight=None, min_split_loss=0.001675880790564889,
             missing=nan, monotone_constraints=None, n_estimators=1631,
             n_jobs=-1, num_parallel_tree=None, predictor=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.021198546238726898,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=7, max_leaves=None,
             min_child_weight=None, min_split_loss=0.001675880790564889,
             missing=nan, monotone_constraints=None, n_estimators=1631,
             n_jobs=-1, num_parallel_tree=None, predictor=None, ...)

Processing test-set¶

In [163]:
X_test = df_test.copy()
y_test = X_test.pop('Price')
In [165]:
X_test.drop(['Date', 'Price_per_m2', 'Unit', 'Year', 'Month'], axis=1, inplace=True)
In [166]:
X_ordinal_test = X_test.copy()
for col in X_test.select_dtypes(include=['category']):
    X_ordinal_test[col] = X_ordinal_test[col].cat.codes
In [167]:
X_scaled_test = pd.DataFrame(scaler.transform(X_ordinal_test))
X_scaled_test.columns = X_ordinal_test.columns.astype(str)

Testing¶

In [168]:
y_pred = model.predict(X_scaled_test)
print('RMSE score:', mean_squared_error(y_test, y_pred, squared=False))
print('MAE score:', mean_absolute_error(y_test, y_pred))
RMSE score: 78.57697097217158
MAE score: 3.2443219022505043

Summary¶

In [169]:
df_pred = pd.DataFrame({
    'y_pred': y_pred,
    'y_test': y_test,
    'error': abs(y_pred - y_test)
})
In [170]:
df_pred.head()
Out[170]:
y_pred y_test error
0 4.298893452 4.000000000 0.298893452
1 5.150403023 32.799900000 27.649496977
2 2.843826056 2.600150000 0.243676056
3 4.944609165 5.499900000 0.555290835
4 5.546931744 5.200000000 0.346931744
In [172]:
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df_pred, x='y_test', y='y_pred', alpha=0.7, color='#F38181')
plt.title('Predicted price vs actual price')
plt.xlabel('Actual price (bilion VND)')
plt.ylabel('Predicted price (bilion VND)')
plt.show()
In [174]:
# Top 10 houses with highest error

df_pred.sort_values(by='error', ascending=False).head(10)
Out[174]:
y_pred y_test error
9823 4.979243279 6200.000000000 6195.020756721
12576 6.040565968 5656.555556000 5650.514990032
4989 4.467294216 3649.998860000 3645.531565784
614 4.337861538 3099.999969000 3095.662107462
8140 4.715719223 2649.998520000 2645.282800777
10166 7.372033596 679.999999980 672.627966384
10475 5.228681087 390.000000000 384.771318913
314 4.114246845 310.000500000 305.886253155
11427 4.289424896 309.997800000 305.708375104
6979 3.896793365 299.999999988 296.103206623
In [176]:
# Only see prediction which not outlier

iqr = df_pred['y_test'].quantile(0.75) - df_pred['y_test'].quantile(0.25)
upper_bound = df_pred['y_test'].quantile(0.75) + 1.5 * iqr
lower_bound = df_pred['y_test'].quantile(0.25) - 1.5 * iqr

df_pred_no_outlier = df_pred[(df_pred['y_test'] < upper_bound) & (df_pred['y_test'] > lower_bound)]
In [177]:
lim = (-0.2, 9.2)

plt.figure(figsize=(12, 8))
sns.scatterplot(data=df_pred_no_outlier, x='y_test', y='y_pred', alpha=0.5, color='#F38181')
plt.xlim(*lim)
plt.ylim(*lim)
plt.plot(lim, lim, color='red', linestyle='--')
plt.title('Predicted price vs actual price (no outlier)')
plt.xlabel('Actual price (bilion VND)')
plt.ylabel('Predicted price (bilion VND)')
plt.show()
In [178]:
def plot_pred_actual(col, col_wrap=4):
    lim = (-0.2, 9.2)
    g = sns.relplot(data=df_pred_no_outlier, x='y_test', y='y_pred', alpha=0.7, color='#F38181', col=X_test[col], kind='scatter', col_wrap=col_wrap)
    for ax in g.axes.flat:
        ax.set_xlim(*lim)
        ax.set_ylim(*lim)
        ax.plot(lim, lim, color='red', linestyle='--')
    g.fig.suptitle('Predicted price vs actual price (no outlier)', y=1.05)
    g.set_axis_labels('Actual price (bilion VND)', 'Predicted price (bilion VND)')
    plt.show()
In [180]:
plot_pred_actual('Type_of_housing')
In [181]:
plot_pred_actual('Inner_city', col_wrap=2)
In [182]:
plot_pred_actual('Number_of_floors')
In [183]:
plot_pred_actual('Number_of_bedrooms')
In [186]:
plot_pred_actual('District')
In [ ]: